Commit fdbf43d
Merge branch 'main' into patch-1
2 parents: c1de548 + 84cb0ac

5 files changed: 47 additions, 239 deletions

slowfast/config/defaults.py (11 additions, 2 deletions)

@@ -110,10 +110,19 @@
 # NUM_SPLITS splits, and run BN on each of them separately independently.
 _C.BN.NUM_SPLITS = 1

-# Parameter for NaiveSyncBatchNorm3d, where the stats across `NUM_SYNC_DEVICES`
-# devices will be synchronized.
+# Parameter for NaiveSyncBatchNorm, where the stats across `NUM_SYNC_DEVICES`
+# devices will be synchronized. `NUM_SYNC_DEVICES` cannot be larger than number of
+# devices per machine; if global sync is desired, set `GLOBAL_SYNC`.
+# By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting
+# CONTRASTIVE.BN_SYNC_MLP if appropriate.
 _C.BN.NUM_SYNC_DEVICES = 1

+# Parameter for NaiveSyncBatchNorm. Setting `GLOBAL_SYNC` to True synchronizes
+# stats across all devices, across all machines; in this case, `NUM_SYNC_DEVICES`
+# must be set to None.
+# By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting
+# CONTRASTIVE.BN_SYNC_MLP if appropriate.
+_C.BN.GLOBAL_SYNC = False

 # ---------------------------------------------------------------------------- #
 # Training options.

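Note: the new comments describe a constraint rather than code, so here is a small, self-contained sketch of the rule they imply: per-machine sync is bounded by the number of GPUs on one machine, and global sync expects `NUM_SYNC_DEVICES` to be None. The helper `check_bn_sync_cfg`, its arguments, and the value 8 are hypothetical and not part of the commit.

# Hypothetical validation sketch of the BN sync config semantics described above.
from typing import Optional


def check_bn_sync_cfg(
    num_sync_devices: Optional[int], global_sync: bool, gpus_per_machine: int
) -> None:
    """Validate the relationship between NUM_SYNC_DEVICES and GLOBAL_SYNC."""
    if global_sync:
        # Global sync spans all machines; the per-machine group size is unused.
        assert num_sync_devices is None, (
            "GLOBAL_SYNC=True expects NUM_SYNC_DEVICES to be None"
        )
        return
    # Per-machine sync: the group cannot exceed the devices on one machine.
    assert num_sync_devices is not None and 1 <= num_sync_devices <= gpus_per_machine, (
        "NUM_SYNC_DEVICES cannot be larger than the number of devices per machine"
    )


check_bn_sync_cfg(num_sync_devices=8, global_sync=False, gpus_per_machine=8)
check_bn_sync_cfg(num_sync_devices=None, global_sync=True, gpus_per_machine=8)
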
slowfast/models/batchnorm_helper.py (4 additions, 161 deletions)

@@ -4,12 +4,10 @@
 """BatchNorm (BN) utility functions and custom batch-size BN implementations"""

 from functools import partial
+
 import torch
-import torch.distributed as dist
 import torch.nn as nn
-from torch.autograd.function import Function
-
-import slowfast.utils.distributed as du
+from pytorchvideo.layers.batch_norm import NaiveSyncBatchNorm3d, NaiveSyncBatchNorm1d  # noqa


 def get_norm(cfg):

@@ -26,7 +24,8 @@ def get_norm(cfg):
         return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS)
     elif cfg.BN.NORM_TYPE == "sync_batchnorm":
         return partial(
-            NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES
+            NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES,
+            global_sync=cfg.BN.GLOBAL_SYNC
         )
     else:
         raise NotImplementedError(

@@ -107,159 +106,3 @@ def forward(self, x):
         x = x * self.weight.view((-1, 1, 1, 1))
         x = x + self.bias.view((-1, 1, 1, 1))
         return x
-
-
-class GroupGather(Function):
-    """
-    GroupGather performs all gather on each of the local process/ GPU groups.
-    """
-
-    @staticmethod
-    def forward(ctx, input, num_sync_devices, num_groups):
-        """
-        Perform forwarding, gathering the stats across different process/ GPU
-        group.
-        """
-        ctx.num_sync_devices = num_sync_devices
-        ctx.num_groups = num_groups
-
-        input_list = [
-            torch.zeros_like(input) for k in range(du.get_local_size())
-        ]
-        dist.all_gather(
-            input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP
-        )
-
-        inputs = torch.stack(input_list, dim=0)
-        if num_groups > 1:
-            rank = du.get_local_rank()
-            group_idx = rank // num_sync_devices
-            inputs = inputs[
-                group_idx
-                * num_sync_devices : (group_idx + 1)
-                * num_sync_devices
-            ]
-        inputs = torch.sum(inputs, dim=0)
-        return inputs
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """
-        Perform backwarding, gathering the gradients across different process/ GPU
-        group.
-        """
-        grad_output_list = [
-            torch.zeros_like(grad_output) for k in range(du.get_local_size())
-        ]
-        dist.all_gather(
-            grad_output_list,
-            grad_output,
-            async_op=False,
-            group=du._LOCAL_PROCESS_GROUP,
-        )
-
-        grads = torch.stack(grad_output_list, dim=0)
-        if ctx.num_groups > 1:
-            rank = du.get_local_rank()
-            group_idx = rank // ctx.num_sync_devices
-            grads = grads[
-                group_idx
-                * ctx.num_sync_devices : (group_idx + 1)
-                * ctx.num_sync_devices
-            ]
-        grads = torch.sum(grads, dim=0)
-        return grads, None, None
-
-
-class NaiveSyncBatchNorm3d(nn.BatchNorm3d):
-    def __init__(self, num_sync_devices, **args):
-        """
-        Naive version of Synchronized 3D BatchNorm.
-        Args:
-            num_sync_devices (int): number of device to sync.
-            args (list): other arguments.
-        """
-        self.num_sync_devices = num_sync_devices
-        if self.num_sync_devices > 0:
-            assert du.get_local_size() % self.num_sync_devices == 0, (
-                du.get_local_size(),
-                self.num_sync_devices,
-            )
-            self.num_groups = du.get_local_size() // self.num_sync_devices
-        else:
-            self.num_sync_devices = du.get_local_size()
-            self.num_groups = 1
-        super(NaiveSyncBatchNorm3d, self).__init__(**args)
-
-    def forward(self, input):
-        if du.get_local_size() == 1 or not self.training:
-            return super().forward(input)
-
-        assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
-        C = input.shape[1]
-        mean = torch.mean(input, dim=[0, 2, 3, 4])
-        meansqr = torch.mean(input * input, dim=[0, 2, 3, 4])
-
-        vec = torch.cat([mean, meansqr], dim=0)
-        vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * (
-            1.0 / self.num_sync_devices
-        )
-
-        mean, meansqr = torch.split(vec, C)
-        var = meansqr - mean * mean
-        self.running_mean += self.momentum * (mean.detach() - self.running_mean)
-        self.running_var += self.momentum * (var.detach() - self.running_var)
-
-        invstd = torch.rsqrt(var + self.eps)
-        scale = self.weight * invstd
-        bias = self.bias - mean * scale
-        scale = scale.reshape(1, -1, 1, 1, 1)
-        bias = bias.reshape(1, -1, 1, 1, 1)
-        return input * scale + bias
-
-
-class NaiveSyncBatchNorm1d(nn.BatchNorm1d):
-    def __init__(self, num_sync_devices, **args):
-        """
-        Naive version of Synchronized 1D BatchNorm.
-        Args:
-            num_sync_devices (int): number of device to sync.
-            args (list): other arguments.
-        """
-        self.num_sync_devices = num_sync_devices
-        if self.num_sync_devices > 0:
-            assert du.get_local_size() % self.num_sync_devices == 0, (
-                du.get_local_size(),
-                self.num_sync_devices,
-            )
-            self.num_groups = du.get_local_size() // self.num_sync_devices
-        else:
-            self.num_sync_devices = du.get_local_size()
-            self.num_groups = 1
-        super(NaiveSyncBatchNorm1d, self).__init__(**args)
-
-    def forward(self, input):
-        if du.get_local_size() == 1 or not self.training:
-            return super().forward(input)
-
-        assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
-        C = input.shape[1]
-        mean = torch.mean(input, dim=[0])
-        meansqr = torch.mean(input * input, dim=[0])
-
-        vec = torch.cat([mean, meansqr], dim=0)
-        vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * (
-            1.0 / self.num_sync_devices
-        )
-
-        mean, meansqr = torch.split(vec, C)
-        var = meansqr - mean * mean
-        self.running_mean += self.momentum * (mean.detach() - self.running_mean)
-        self.running_var += self.momentum * (var.detach() - self.running_var)
-
-        invstd = torch.rsqrt(var + self.eps)
-        scale = self.weight * invstd
-        bias = self.bias - mean * scale
-        scale = scale.reshape(1, -1)
-        bias = bias.reshape(1, -1)
-        return input * scale + bias

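Note: after this change the local GroupGather/NaiveSyncBatchNorm implementations are gone and the sync BN layers come from pytorchvideo. A minimal sketch of how the slimmed-down get_norm() result is meant to be consumed, assuming pytorchvideo is installed and that NaiveSyncBatchNorm3d accepts the num_sync_devices / global_sync keywords the diff passes through; the literal values 1 and False stand in for cfg.BN.NUM_SYNC_DEVICES and cfg.BN.GLOBAL_SYNC.

# Sketch only; mirrors what get_norm(cfg) returns for NORM_TYPE == "sync_batchnorm".
from functools import partial

from pytorchvideo.layers.batch_norm import NaiveSyncBatchNorm3d

norm_module = partial(
    NaiveSyncBatchNorm3d,
    num_sync_devices=1,   # cfg.BN.NUM_SYNC_DEVICES
    global_sync=False,    # cfg.BN.GLOBAL_SYNC
)

# Model code then calls the factory like any BatchNorm3d constructor;
# outside a distributed run it behaves like plain BatchNorm3d.
bn = norm_module(num_features=64)
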
slowfast/models/head_helper.py (17 additions, 2 deletions)

@@ -155,6 +155,7 @@ def __init__(
         flatten=False,
         xavier_init=True,
         bn_sync_num=1,
+        global_sync=False,
     ):
         super(MLPHead, self).__init__()
         self.flatten = flatten

@@ -164,10 +165,12 @@ def __init__(
         mlp_layers[-1].xavier_init = xavier_init
         for i in range(1, num_layers):
             if bn_on:
-                if bn_sync_num > 1:
+                if global_sync or bn_sync_num > 1:
                     mlp_layers.append(
                         NaiveSyncBatchNorm1d(
-                            num_sync_devices=bn_sync_num, num_features=mlp_dim
+                            num_sync_devices=bn_sync_num,
+                            global_sync=global_sync,
+                            num_features=mlp_dim
                         )
                     )
                 else:

@@ -266,6 +269,10 @@ def __init__(
                 bn_sync_num=cfg.BN.NUM_SYNC_DEVICES
                 if cfg.CONTRASTIVE.BN_SYNC_MLP
                 else 1,
+                global_sync=(
+                    cfg.CONTRASTIVE.BN_SYNC_MLP and
+                    cfg.BN.GLOBAL_SYNC
+                ),
             )

         # Softmax for evaluation and testing.

@@ -294,6 +301,10 @@ def __init__(
                     bn_sync_num=cfg.BN.NUM_SYNC_DEVICES
                     if cfg.CONTRASTIVE.BN_SYNC_MLP
                     else 1,
+                    global_sync=(
+                        cfg.CONTRASTIVE.BN_SYNC_MLP and
+                        cfg.BN.GLOBAL_SYNC
+                    ),
                 )
                 self.predictors.append(local_mlp)

@@ -525,6 +536,10 @@ def __init__(
                 bn_sync_num=cfg.BN.NUM_SYNC_DEVICES
                 if cfg.CONTRASTIVE.BN_SYNC_MLP
                 else 1,
+                global_sync=(
+                    cfg.CONTRASTIVE.BN_SYNC_MLP and
+                    cfg.BN.GLOBAL_SYNC
+                ),
             )
         self.detach_final_fc = cfg.MODEL.DETACH_FINAL_FC


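Note: every MLPHead construction site now receives the same pair of keyword arguments. A hedged sketch of that pattern under the MLPHead signature shown above; the wrapper name build_projector and its positional arguments (dim_in, dim_out, mlp_dim, num_layers) are illustrative and not part of the commit.

# Hypothetical wrapper showing how global_sync is derived from the config.
from slowfast.models.head_helper import MLPHead


def build_projector(cfg, dim_in, dim_out, mlp_dim, num_layers):
    return MLPHead(
        dim_in,
        dim_out,
        mlp_dim,
        num_layers,
        bn_on=True,
        # Per-machine group size, only when the contrastive MLP wants sync BN.
        bn_sync_num=cfg.BN.NUM_SYNC_DEVICES if cfg.CONTRASTIVE.BN_SYNC_MLP else 1,
        # Global sync for the MLP BN layers only when both flags are on.
        global_sync=(cfg.CONTRASTIVE.BN_SYNC_MLP and cfg.BN.GLOBAL_SYNC),
    )
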
slowfast/models/optimizer.py (7 additions, 0 deletions)

@@ -115,6 +115,13 @@ def construct_optimizer(model, cfg):
             eps=1e-08,
             weight_decay=cfg.SOLVER.WEIGHT_DECAY,
         )
+    elif cfg.SOLVER.OPTIMIZING_METHOD == "mt_adamw":
+        optimizer = torch.optim._multi_tensor.AdamW(
+            optim_params,
+            lr=cfg.SOLVER.BASE_LR,
+            eps=1e-08,
+            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
+        )
     else:
         raise NotImplementedError(
             "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD)

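Note: the new "mt_adamw" choice wires up PyTorch's multi-tensor AdamW, which batches the per-parameter update math into fewer, larger kernel launches. A sketch of the dispatch under the assumption of the cfg fields used above; be aware that torch.optim._multi_tensor is a private namespace, and recent PyTorch releases expose the same behavior via the foreach argument of torch.optim.AdamW.

# Sketch of the optimizer selection; make_adamw is a hypothetical helper that
# mirrors the new branch in construct_optimizer.
import torch


def make_adamw(optim_params, cfg):
    if cfg.SOLVER.OPTIMIZING_METHOD == "mt_adamw":
        # Private multi-tensor implementation, as used by this commit.
        return torch.optim._multi_tensor.AdamW(
            optim_params,
            lr=cfg.SOLVER.BASE_LR,
            eps=1e-08,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        )
    # Plain AdamW fallback (cfg.SOLVER.OPTIMIZING_METHOD == "adamw").
    return torch.optim.AdamW(
        optim_params,
        lr=cfg.SOLVER.BASE_LR,
        eps=1e-08,
        weight_decay=cfg.SOLVER.WEIGHT_DECAY,
    )
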
slowfast/utils/distributed.py (8 additions, 74 deletions)

@@ -9,25 +9,14 @@
 import torch
 import torch.distributed as dist

-_LOCAL_PROCESS_GROUP = None
-
-
-def cat_all_gather(tensors, local=False):
-    """Performs the concatenated all_reduce operation on the provided tensors."""
-    if local:
-        gather_sz = get_local_size()
-    else:
-        gather_sz = torch.distributed.get_world_size()
-    tensors_gather = [torch.ones_like(tensors) for _ in range(gather_sz)]
-    torch.distributed.all_gather(
-        tensors_gather,
-        tensors,
-        async_op=False,
-        group=_LOCAL_PROCESS_GROUP if local else None,
-    )
-    output = torch.cat(tensors_gather, dim=0)
-    return output
-
+from pytorchvideo.layers.distributed import (  # noqa
+    get_world_size,
+    cat_all_gather,
+    init_distributed_training,
+    get_local_size,
+    get_local_rank,
+    get_local_process_group,
+)

 def all_gather(tensors):
     """

@@ -128,17 +117,6 @@ def is_root_proc():
     return True


-def get_world_size():
-    """
-    Get the size of the world.
-    """
-    if not dist.is_available():
-        return 1
-    if not dist.is_initialized():
-        return 1
-    return dist.get_world_size()
-
-
 def get_rank():
     """
     Get the rank of the current process.

@@ -282,50 +260,6 @@ def all_gather_unaligned(data, group=None):
     return data_list


-def init_distributed_training(cfg):
-    """
-    Initialize variables needed for distributed training.
-    """
-    if cfg.NUM_GPUS <= 1:
-        return
-    num_gpus_per_machine = cfg.NUM_GPUS
-    num_machines = dist.get_world_size() // num_gpus_per_machine
-    for i in range(num_machines):
-        ranks_on_i = list(
-            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
-        )
-        pg = dist.new_group(ranks_on_i)
-        if i == cfg.SHARD_ID:
-            global _LOCAL_PROCESS_GROUP
-            _LOCAL_PROCESS_GROUP = pg
-
-
-def get_local_size() -> int:
-    """
-    Returns:
-        The size of the per-machine process group,
-        i.e. the number of processes per machine.
-    """
-    if not dist.is_available():
-        return 1
-    if not dist.is_initialized():
-        return 1
-    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
-
-
-def get_local_rank() -> int:
-    """
-    Returns:
-        The rank of the current process within the local (per-machine) process group.
-    """
-    if not dist.is_available():
-        return 0
-    if not dist.is_initialized():
-        return 0
-    assert _LOCAL_PROCESS_GROUP is not None
-    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
-
-
 class GatherLayer(torch.autograd.Function):
     """Gather tensors from all process, supporting backward propagation."""


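Note: the deleted helpers are not removed from the project's API; they now live in pytorchvideo and are re-exported by slowfast.utils.distributed. A small sketch, assuming pytorchvideo is installed, showing that both import paths resolve to the same objects; the print line is illustrative only.

# Sketch: the re-export means old call sites using slowfast.utils.distributed
# keep working, while new code can import from pytorchvideo directly.
from pytorchvideo.layers.distributed import cat_all_gather, get_local_size

import slowfast.utils.distributed as du

# Same function objects, just re-exported by the slimmed-down module.
assert du.cat_all_gather is cat_all_gather
assert du.get_local_size is get_local_size

print("processes on this machine:", get_local_size())  # 1 when not distributed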