From 66065a50d7282fd79ca91da6f4f4d2897e301e4b Mon Sep 17 00:00:00 2001 From: Zachary Mueller Date: Fri, 10 Mar 2023 10:22:26 -0500 Subject: [PATCH 1/4] Fix CPU error always being raised (#1175) * Save state * Revert to old behavior * Fix failing test/update * Remove duplicate test --- src/accelerate/state.py | 17 +++++++---------- tests/test_accelerator.py | 9 ++++++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/accelerate/state.py b/src/accelerate/state.py index d1d0dec12c5..726c0ea7247 100644 --- a/src/accelerate/state.py +++ b/src/accelerate/state.py @@ -78,13 +78,6 @@ class PartialState: def __init__(self, cpu: bool = False, **kwargs): self.__dict__ = self._shared_state - # Raise an error if the user tries to reinitialize on a different device setup in the same launch - if self.initialized and (self._cpu != cpu): - raise AssertionError( - "The current device and desired device are not the same. If the `PartialState` was generated " - "before the `Accelerator` has been instantiated, ensure the `cpu` flag is the same for both. In this case, " - f"the `PartialState` has {self._cpu} and the desired device is {cpu}. Please use `cpu={self._cpu}`." - ) if not self.initialized: self._cpu = cpu self.backend = None @@ -540,10 +533,12 @@ def __init__( **kwargs, ): self.__dict__ = self._shared_state - if PartialState._shared_state == {} or (cpu != PartialState._shared_state.get("_cpu", False)): + if parse_flag_from_env("ACCELERATE_USE_CPU"): + cpu = True + if PartialState._shared_state == {}: PartialState(cpu, **kwargs) self.__dict__.update(PartialState._shared_state) - self._check_initialized(mixed_precision) + self._check_initialized(mixed_precision, cpu) if not self.initialized: self.deepspeed_plugin = None mixed_precision = ( @@ -599,10 +594,12 @@ def __repr__(self): repr += f"ds_config: {self.deepspeed_plugin.deepspeed_config}\n" return repr - def _check_initialized(self, mixed_precision=None): + def _check_initialized(self, mixed_precision=None, cpu=None): "Checks if a modification is trying to be made and the `AcceleratorState` has already been initialized" if self.initialized: err = "AcceleratorState has already been initialized and cannot be changed, restart your runtime completely and pass `{flag}` to `Accelerator()`." 
+ if cpu and self.device.type != "cpu": + raise ValueError(err.format(flag="cpu=True")) if ( mixed_precision is not None and mixed_precision != self._mixed_precision diff --git a/tests/test_accelerator.py b/tests/test_accelerator.py index d6b74fc1557..9c846639855 100644 --- a/tests/test_accelerator.py +++ b/tests/test_accelerator.py @@ -40,7 +40,7 @@ def test_accelerator_can_be_reinstantiated(self): _ = Accelerator() assert PartialState._shared_state["_cpu"] is False assert PartialState._shared_state["device"].type == "cuda" - with self.assertRaises(AssertionError): + with self.assertRaises(ValueError): _ = Accelerator(cpu=True) def test_prepared_objects_are_referenced(self): @@ -226,3 +226,10 @@ def test_accelerator_bnb_multi_gpu(self): # This should not work and get value error with self.assertRaises(ValueError): _ = accelerator.prepare(model) + + @require_cuda + def test_accelerator_cpu_flag_prepare(self): + model = torch.nn.Linear(10, 10) + sgd = torch.optim.SGD(model.parameters(), lr=0.01) + accelerator = Accelerator(cpu=True) + _ = accelerator.prepare(sgd) From 8dec01a7b851f1d0007f516c02be902a5f936203 Mon Sep 17 00:00:00 2001 From: Saarthak S Date: Fri, 10 Mar 2023 18:36:52 -0500 Subject: [PATCH 2/4] fixed typo in launch.py tpu_pod_launcher (#1180) --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index abd9ea4dc0d..4e6c63d96e7 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -707,7 +707,7 @@ def tpu_pod_launcher(args): str(1), "--mixed_precision", "no", - "--dynmo_backend", + "--dynamo_backend", "no", "--num_processes", str(args.num_processes), From 3b3605e01c93589f48ac8eb3e7852cbc372bfcfa Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 13 Mar 2023 10:48:31 -0400 Subject: [PATCH 3/4] Support special mapping of dtypes when preparing device map (#1179) --- src/accelerate/utils/modeling.py | 35 +++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py index 725b782ca1a..fe5eee0a6e5 100644 --- a/src/accelerate/utils/modeling.py +++ b/src/accelerate/utils/modeling.py @@ -259,19 +259,36 @@ def retie_parameters(model, tied_params): setattr(tied_module, tied_param_name.split(".")[-1], param) -def compute_module_sizes(model: nn.Module, dtype: Optional[Union[str, torch.device]] = None): +def _get_proper_dtype(dtype: Union[str, torch.device]) -> torch.dtype: """ - Compute the size of each submodule of a given model. + Just does torch.dtype(dtype) if necessary. """ if isinstance(dtype, str): # We accept "torch.float16" or just "float16" dtype = dtype.replace("torch.", "") dtype = getattr(torch, dtype) + return dtype + + +def compute_module_sizes( + model: nn.Module, + dtype: Optional[Union[str, torch.device]] = None, + special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None, +): + """ + Compute the size of each submodule of a given model. 
+ """ if dtype is not None: + dtype = _get_proper_dtype(dtype) dtype_size = dtype_byte_size(dtype) + if special_dtypes is not None: + special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()} + special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()} module_sizes = defaultdict(int) for name, tensor in named_module_tensors(model, recurse=True): - if dtype is None: + if special_dtypes is not None and name in special_dtypes: + size = tensor.numel() * special_dtypes_size[name] + elif dtype is None: size = tensor.numel() * dtype_byte_size(tensor.dtype) else: size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype)) @@ -394,6 +411,7 @@ def get_balanced_memory( max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, no_split_module_classes: Optional[List[str]] = None, dtype: Optional[Union[str, torch.dtype]] = None, + special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None, low_zero: bool = False, ): """ @@ -416,6 +434,9 @@ def get_balanced_memory( residual connection). dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. + special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*): + If provided, special dtypes to consider for some specific weights (will override dtype used as default for + all weights). low_zero (`bool`, *optional*): Minimizes the number of weights on GPU 0, which is convenient when it's used for other operations (like the Transformers generate function). @@ -427,7 +448,7 @@ def get_balanced_memory( return max_memory num_devices = len([d for d in max_memory if torch.device(d).type == "cuda" and max_memory[d] > 0]) - module_sizes = compute_module_sizes(model, dtype=dtype) + module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes) per_gpu = module_sizes[""] // (num_devices - 1 if low_zero else num_devices) # We can't just set the memory to model_size // num_devices as it will end being too small: each GPU will get @@ -486,6 +507,7 @@ def infer_auto_device_map( max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, no_split_module_classes: Optional[List[str]] = None, dtype: Optional[Union[str, torch.dtype]] = None, + special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None, ): """ Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk, @@ -514,6 +536,9 @@ def infer_auto_device_map( residual connection). dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. + special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*): + If provided, special dtypes to consider for some specific weights (will override dtype used as default for + all weights). """ # Get default / clean up max_memory max_memory = get_max_memory(max_memory) @@ -530,7 +555,7 @@ def infer_auto_device_map( # Devices that need to keep space for a potential offloaded layer. 
main_devices = [gpus[0], "cpu"] if len(gpus) > 0 else ["cpu"] - module_sizes = compute_module_sizes(model, dtype=dtype) + module_sizes = compute_module_sizes(model, dtype=dtype, special_dtypes=special_dtypes) tied_parameters = find_tied_parameters(model) device_map = {} From c266cf064829702d05c48f2fbacfb5c1e39c38e9 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 13 Mar 2023 16:48:03 -0400 Subject: [PATCH 4/4] Patch release: v0.17.1 --- setup.py | 2 +- src/accelerate/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 28981c269df..cfb9d585662 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup( name="accelerate", - version="0.17.0", + version="0.17.1", description="Accelerate", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/accelerate/__init__.py b/src/accelerate/__init__.py index 7345c40cb4d..16bd8a09da9 100644 --- a/src/accelerate/__init__.py +++ b/src/accelerate/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.17.0" +__version__ = "0.17.1" from .accelerator import Accelerator from .big_modeling import (
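
Usage note: patch 3/4 above adds a `special_dtypes` argument to `compute_module_sizes`, `get_balanced_memory` and `infer_auto_device_map` so that selected weights can be sized with a dtype that differs from the default `dtype`. Below is a minimal sketch of how the new argument might be called; the toy model and the "1.weight"/"1.bias" keys are placeholders for illustration only, not anything taken from the patch itself.

    import torch
    import torch.nn as nn

    from accelerate.utils.modeling import compute_module_sizes, infer_auto_device_map

    # Placeholder model; in practice this would be a large pretrained model.
    model = nn.Sequential(nn.Linear(1024, 1024), nn.Linear(1024, 1024))

    # Count most weights at float16 size, but keep the second layer's
    # weights counted at float32 via `special_dtypes`.
    overrides = {"1.weight": torch.float32, "1.bias": torch.float32}

    sizes = compute_module_sizes(model, dtype=torch.float16, special_dtypes=overrides)
    print("total model size in bytes:", sizes[""])

    # The same overrides flow through to device-map inference.
    device_map = infer_auto_device_map(
        model,
        max_memory={0: "1GiB", "cpu": "4GiB"},
        dtype=torch.float16,
        special_dtypes=overrides,
    )
    print(device_map)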