@@ -72,11 +72,12 @@ def build_transformer(cfg, return_intermediate=False):
 # ----------------- Transformer Encoder modules -----------------
 class TransformerEncoderLayer(nn.Module):
     def __init__(self,
-                 d_model :int = 256,
-                 num_heads :int = 8,
-                 ffn_dim :int = 1024,
-                 dropout :float = 0.1,
-                 act_type :str = "relu",
+                 d_model :int = 256,
+                 num_heads :int = 8,
+                 ffn_dim :int = 1024,
+                 dropout :float = 0.1,
+                 act_type :str = "relu",
+                 pre_norm :bool = False,
                  ):
         super().__init__()
         # ----------- Basic parameters -----------
@@ -85,6 +86,7 @@ def __init__(self,
         self.ffn_dim = ffn_dim
         self.dropout = dropout
         self.act_type = act_type
+        self.pre_norm = pre_norm
         # ----------- Basic parameters -----------
         # Multi-head Self-Attn
         self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
@@ -97,7 +99,27 @@ def __init__(self,
     def with_pos_embed(self, tensor, pos):
         return tensor if pos is None else tensor + pos
 
-    def forward(self, src, pos_embed):
+    def forward_pre_norm(self, src, pos_embed):
+        """
+        Input:
+            src: [torch.Tensor] -> [B, N, C]
+            pos_embed: [torch.Tensor] -> [B, N, C]
+        Output:
+            src: [torch.Tensor] -> [B, N, C]
+        """
+        src = self.norm(src)
+        q = k = self.with_pos_embed(src, pos_embed)
+
+        # -------------- MHSA --------------
+        src2 = self.self_attn(q, k, value=src)[0]
+        src = src + self.dropout(src2)
+
+        # -------------- FFN --------------
+        src = self.ffn(src)
+
+        return src
+
+    def forward_post_norm(self, src, pos_embed):
         """
         Input:
             src: [torch.Tensor] -> [B, N, C]
@@ -117,15 +139,22 @@ def forward(self, src, pos_embed):
 
         return src
 
+    def forward(self, src, pos_embed):
+        if self.pre_norm:
+            return self.forward_pre_norm(src, pos_embed)
+        else:
+            return self.forward_post_norm(src, pos_embed)
+
 class TransformerEncoder(nn.Module):
     def __init__(self,
                  d_model :int = 256,
                  num_heads :int = 8,
                  num_layers :int = 1,
                  ffn_dim :int = 1024,
-                 pe_temperature : float = 10000.,
+                 pe_temperature :float = 10000.,
                  dropout :float = 0.1,
                  act_type :str = "relu",
+                 pre_norm :bool = False,
                  ):
         super().__init__()
         # ----------- Basic parameters -----------
@@ -135,11 +164,12 @@ def __init__(self,
         self.ffn_dim = ffn_dim
         self.dropout = dropout
         self.act_type = act_type
+        self.pre_norm = pre_norm
         self.pe_temperature = pe_temperature
         self.pos_embed = None
         # ----------- Basic parameters -----------
         self.encoder_layers = get_clones(
-            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type), num_layers)
+            TransformerEncoderLayer(d_model, num_heads, ffn_dim, dropout, act_type, pre_norm), num_layers)
 
     def build_2d_sincos_position_embedding(self, device, w, h, embed_dim=256, temperature=10000.):
         assert embed_dim % 4 == 0, \
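A minimal usage sketch of the new pre_norm switch (not part of the diff; the import path is hypothetical, and the sketch assumes the layer's norm, dropout and ffn submodules defined elsewhere in this file, with inputs following the [B, N, C] convention from the docstrings):

import torch
from transformer import TransformerEncoderLayer  # hypothetical import path

# pre_norm=True routes forward() to forward_pre_norm, which applies
# self.norm to the input before self-attention (pre-norm formulation).
layer = TransformerEncoderLayer(d_model=256, num_heads=8, ffn_dim=1024,
                                dropout=0.1, act_type="relu", pre_norm=True)

src = torch.randn(2, 100, 256)        # [B, N, C]
pos_embed = torch.randn(2, 100, 256)  # [B, N, C]
out = layer(src, pos_embed)           # same shape as src: [2, 100, 256]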