From 66065a50d7282fd79ca91da6f4f4d2897e301e4b Mon Sep 17 00:00:00 2001 From: Zachary Mueller Date: Fri, 10 Mar 2023 10:22:26 -0500 Subject: [PATCH 1/4] Fix CPU error always being raised (#1175) * Save state * Revert to old behavior * Fix failing test/update * Remove duplicate test --- src/accelerate/state.py | 17 +++++++---------- tests/test_accelerator.py | 9 ++++++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/accelerate/state.py b/src/accelerate/state.py index d1d0dec12c5..726c0ea7247 100644 --- a/src/accelerate/state.py +++ b/src/accelerate/state.py @@ -78,13 +78,6 @@ class PartialState: def __init__(self, cpu: bool = False, **kwargs): self.__dict__ = self._shared_state - # Raise an error if the user tries to reinitialize on a different device setup in the same launch - if self.initialized and (self._cpu != cpu): - raise AssertionError( - "The current device and desired device are not the same. If the `PartialState` was generated " - "before the `Accelerator` has been instantiated, ensure the `cpu` flag is the same for both. In this case, " - f"the `PartialState` has {self._cpu} and the desired device is {cpu}. Please use `cpu={self._cpu}`." - ) if not self.initialized: self._cpu = cpu self.backend = None @@ -540,10 +533,12 @@ def __init__( **kwargs, ): self.__dict__ = self._shared_state - if PartialState._shared_state == {} or (cpu != PartialState._shared_state.get("_cpu", False)): + if parse_flag_from_env("ACCELERATE_USE_CPU"): + cpu = True + if PartialState._shared_state == {}: PartialState(cpu, **kwargs) self.__dict__.update(PartialState._shared_state) - self._check_initialized(mixed_precision) + self._check_initialized(mixed_precision, cpu) if not self.initialized: self.deepspeed_plugin = None mixed_precision = ( @@ -599,10 +594,12 @@ def __repr__(self): repr += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n" return repr - def _check_initialized(self, mixed_precision=None): + def _check_initialized(self, mixed_precision=None, cpu=None): "Checks if a modification is trying to be made and the `AcceleratorState` has already been initialized" if self.initialized: err = "AcceleratorState has already been initialized and cannot be changed, restart your runtime completely and pass `{flag}` to `Accelerator()`." 
+ if cpu and self.device.type != "cpu": + raise ValueError(err.format(flag="cpu=True")) if ( mixed_precision is not None and mixed_precision != self._mixed_precision diff --git a/tests/test_accelerator.py b/tests/test_accelerator.py index d6b74fc1557..9c846639855 100644 --- a/tests/test_accelerator.py +++ b/tests/test_accelerator.py @@ -40,7 +40,7 @@ def test_accelerator_can_be_reinstantiated(self): _ = Accelerator() assert PartialState._shared_state["_cpu"] is False assert PartialState._shared_state["device"].type == "cuda" - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): _ = Accelerator(cpu=True) def test_prepared_objects_are_referenced(self): @@ -226,3 +226,10 @@ def test_accelerator_bnb_multi_gpu(self): # This should not work and get value error with self.assertRaises(ValueError): _ = accelerator.prepare(model) + + @require_cuda + def test_accelerator_cpu_flag_prepare(self): + model = torch.nn.Linear(10, 10) + sgd = torch.optim.SGD(model.parameters(), lr=0.01) + accelerator = Accelerator(cpu=True) + _ = accelerator.prepare(sgd) From 8dec01a7b851f1d0007f516c02be902a5f936203 Mon Sep 17 00:00:00 2001 From: Saarthak S Date: Fri, 10 Mar 2023 18:36:52 -0500 Subject: [PATCH 2/4] fixed typo in launch.py tpu_pod_launcher (#1180) --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index abd9ea4dc0d..4e6c63d96e7 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -707,7 +707,7 @@ def tpu_pod_launcher(args): str(1), "--mixed_precision", "no", - "--dynmo_backend", + "--dynamo_backend", "no", "--num_processes", str(args.num_processes), From 3b3605e01c93589f48ac8eb3e7852cbc372bfcfa Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 13 Mar 2023 10:48:31 -0400 Subject: [PATCH 3/4] Support special mapping of dtypes when preparing device map (#1179) --- src/accelerate/utils/modeling.py | 35 +++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py index 725b782ca1a..fe5eee0a6e5 100644 --- a/src/accelerate/utils/modeling.py +++ b/src/accelerate/utils/modeling.py @@ -259,19 +259,36 @@ def retie_parameters(model, tied_params): setattr(tied_module, tied_param_name.split(".")[-1], param) -def compute_module_sizes(model: nn.Module, dtype: Optional[Union[str, torch.device]] = None): +def _get_proper_dtype(dtype: Union[str, torch.device]) -> torch.dtype: """ - Compute the size of each submodule of a given model. + Just does torch.dtype(dtype) if necessary. """ if isinstance(dtype, str): # We accept "torch.float16" or just "float16" dtype = dtype.replace("torch.", "") dtype = getattr(torch, dtype) + return dtype + + +def compute_module_sizes( + model: nn.Module, + dtype: Optional[Union[str, torch.device]] = None, + special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None, +): + """ + Compute the size of each submodule of a given model. 
+ """ if dtype is not None: + dtype = _get_proper_dtype(dtype) dtype_size = dtype_byte_size(dtype) + if special_dtypes is not None: + special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()} + special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()} module_sizes = defaultdict(int) for name, tensor in named_module_tensors(model, recurse=True): - if dtype is None: + if special_dtypes is not None and name in special_dtypes: + size = tensor.numel() * special_dtypes_size[name] + elif dtype is None: size = tensor.numel() * dtype_byte_size(tensor.dtype) else: size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype)) @@ -394,6 +411,7 @@ def get_balanced_memory( max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, no_split_module_classes: Optional[List[str]] = None, dtype: Optional[Union[str, torch.dtype]] = None, + special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None, low_zero: bool = False, ): """ @@ -416,6 +434,9 @@ def get_balanced_memory( residual connection). dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. + special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*): + If provided, special dtypes to consider for some specific weights (will override dtype used as default for + all weights). low_zero (`bool`, *optional*): Minimizes the number of weights on GPU 0, which is convenient when it's used for other operations (like the Transformers generate function). @@ -427,7 +448,7 @@ def get_balanced_memory( return max_memory num_devices = len([d for d in max_memory if torch.device(d).type == "cuda" and max_memory[d] > 0]) - module_sizes = compute_module_sizes(model, dtype=dtype) + module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes) per_gpu = module_sizes[""] // (num_devices - 1 if low_zero else num_devices) # We can't just set the memory to model_size // num_devices as it will end being too small: each GPU will get @@ -486,6 +507,7 @@ def infer_auto_device_map( max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, no_split_module_classes: Optional[List[str]] = None, dtype: Optional[Union[str, torch.dtype]] = None, + special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None, ): """ Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk, @@ -514,6 +536,9 @@ def infer_auto_device_map( residual connection). dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. + special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*): + If provided, special dtypes to consider for some specific weights (will override dtype used as default for + all weights). """ # Get default / clean up max_memory max_memory = get_max_memory(max_memory) @@ -530,7 +555,7 @@ def infer_auto_device_map( # Devices that need to keep space for a potential offloaded layer. 
main_devices = [gpus[0], "cpu"] if len(gpus) > 0 else ["cpu"] - module_sizes = compute_module_sizes(model, dtype=dtype) + module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes) tied_parameters = find_tied_parameters(model) device_map = {} From c266cf064829702d05c48f2fbacfb5c1e39c38e9 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 13 Mar 2023 16:48:03 -0400 Subject: [PATCH 4/4] Patch release: v0.17.1 --- setup.py | 2 +- src/accelerate/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 28981c269df..cfb9d585662 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup( name="accelerate", - version="0.17.0", + version="0.17.1", description="Accelerate", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/accelerate/__init__.py b/src/accelerate/__init__.py index 7345c40cb4d..16bd8a09da9 100644 --- a/src/accelerate/__init__.py +++ b/src/accelerate/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.17.0" +__version__ = "0.17.1" from .accelerator import Accelerator from .big_modeling import (
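
Usage note: patch 3/4 above adds a `special_dtypes` argument to `compute_module_sizes`, `get_balanced_memory` and `infer_auto_device_map` so that selected weights can be sized with a dtype that differs from the default `dtype`. Below is a minimal sketch of how the new argument might be called; the toy model and the "1.weight"/"1.bias" keys are placeholders for illustration only, not anything taken from the patch itself.

    import torch
    import torch.nn as nn

    from accelerate.utils.modeling import compute_module_sizes, infer_auto_device_map

    # Placeholder model; in practice this would be a large pretrained model.
    model = nn.Sequential(nn.Linear(1024, 1024), nn.Linear(1024, 1024))

    # Count most weights at float16 size, but keep the second layer's
    # weights counted at float32 via `special_dtypes`.
    overrides = {"1.weight": torch.float32, "1.bias": torch.float32}

    sizes = compute_module_sizes(model, dtype=torch.float16, special_dtypes=overrides)
    print("total model size in bytes:", sizes[""])

    # The same overrides flow through to device-map inference.
    device_map = infer_auto_device_map(
        model,
        max_memory={0: "1GiB", "cpu": "4GiB"},
        dtype=torch.float16,
        special_dtypes=overrides,
    )
    print(device_map)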