
Commit 77898ac

use all pos and neg samples for training relpn

1 parent 907b701 commit 77898ac

File tree (5 files changed, +114 −34 lines):

lib/scene_parser/parser.py
lib/scene_parser/rcnn/modeling/relation_heads/relation_heads.py
lib/scene_parser/rcnn/modeling/relation_heads/relpn/relationshipness.py
lib/scene_parser/rcnn/modeling/relation_heads/relpn/relpn.py
lib/scene_parser/rcnn/modeling/relation_heads/relpn/utils.py

lib/scene_parser/parser.py

Lines changed: 6 additions & 1 deletion
@@ -111,7 +111,12 @@ def forward(self, images, targets=None):
         if self.training and targets is None:
             raise ValueError("In training mode, targets should be passed")
         images = to_image_list(images)
-        features = self.backbone(images.tensors)
+        if self.training:
+            with torch.no_grad():
+                features = self.backbone(images.tensors)
+        else:
+            features = self.backbone(images.tensors)
+
         proposals, proposal_losses = self.rpn(images, features, targets)
         scene_parser_losses = {}
         if self.roi_heads:
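
The training branch above freezes the detector backbone while the relation components train: running it under torch.no_grad() skips autograd graph construction, so no gradients reach the backbone. A minimal standalone sketch of the pattern (the toy backbone and tensor shapes are illustrative, not from this repo):

import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(True))  # toy stand-in
images = torch.randn(2, 3, 32, 32)                                      # fake batch

training = True
if training:
    with torch.no_grad():            # no autograd graph; backbone stays frozen
        features = backbone(images)
else:
    features = backbone(images)

print(features.requires_grad)        # False in the training branch

Freezing the backbone this way also saves memory, since none of its activations need to be kept for backpropagation.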

lib/scene_parser/rcnn/modeling/relation_heads/relation_heads.py

Lines changed: 10 additions & 4 deletions
@@ -115,10 +115,11 @@ def forward(self, features, proposals, targets=None):
             else:
                 proposal_pairs = self.loss_evaluator.subsample(proposals, targets)
         else:
-            if self.cfg.MODEL.USE_RELPN:
-                proposal_pairs, _ = self.relpn(proposals)
-            else:
-                proposal_pairs = self.loss_evaluator.subsample(proposals)
+            with torch.no_grad():
+                if self.cfg.MODEL.USE_RELPN:
+                    proposal_pairs, relnesses = self.relpn(proposals)
+                else:
+                    proposal_pairs = self.loss_evaluator.subsample(proposals)

         if self.cfg.MODEL.USE_FREQ_PRIOR:
             """
@@ -160,6 +161,11 @@ def forward(self, features, proposals, targets=None):
             # proposal.add_field("scores", obj_score)
             # proposal.add_field("labels", obj_label)
             result = self.post_processor((pred_class_logits), proposal_pairs, use_freq_prior=self.cfg.MODEL.USE_FREQ_PRIOR)
+
+            # if self.cfg.MODEL.USE_RELPN:
+            #     for res, relness in zip(result, relnesses):
+            #         res.add_field("scores", res.get_field("scores") * relness.view(-1, 1))
+
             return x, result, {}

         loss_obj_classifier = 0
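
The block the commit adds but leaves commented out sketches a possible rescoring step: multiplying each pair's predicate score distribution by the scalar relatedness returned by the relpn. A hedged, self-contained illustration of that broadcast (the sizes are made up):

import torch

num_pairs, num_predicates = 6, 51               # illustrative sizes
scores = torch.rand(num_pairs, num_predicates)  # per-pair predicate scores
relness = torch.rand(num_pairs)                 # one relatedness score per pair

rescored = scores * relness.view(-1, 1)         # broadcast over the predicate dim
print(rescored.shape)                           # torch.Size([6, 51])

Note that this rescoring stays disabled in the commit; only the relnesses themselves are plumbed through.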

lib/scene_parser/rcnn/modeling/relation_heads/relpn/relationshipness.py

Lines changed: 24 additions & 1 deletion
@@ -1,12 +1,14 @@
 import torch
 import torch.nn as nn
+from .utils import box_pos_encoder

 class Relationshipness(nn.Module):
     """
     compute relationshipness between subjects and objects
     """
     def __init__(self, dim, pos_encoding=False):
         super(Relationshipness, self).__init__()
+
         self.subj_proj = nn.Sequential(
             nn.Linear(dim, 64),
             nn.ReLU(True),
@@ -19,9 +21,30 @@ def __init__(self, dim, pos_encoding=False):
             nn.Linear(64, 64)
         )

-    def forward(self, x, bbox=None):
+        self.pos_encoding = False
+        if pos_encoding:
+            self.pos_encoding = True
+            self.sub_pos_encoder = nn.Sequential(
+                nn.Linear(6, 64),
+                nn.ReLU(True),
+                nn.Linear(64, 64)
+            )
+
+            self.obj_pos_encoder = nn.Sequential(
+                nn.Linear(6, 64),
+                nn.ReLU(True),
+                nn.Linear(64, 64)
+            )
+
+    def forward(self, x, bbox=None, imsize=None):
         x_subj = self.subj_proj(x)  # k x 64
         x_obj = self.obj_prof(x)  # k x 64
         scores = torch.mm(x_subj, x_obj.t())  # k x k
+        if self.pos_encoding:
+            pos = box_pos_encoder(bbox, imsize[0], imsize[1])
+            pos_subj = self.sub_pos_encoder(pos)
+            pos_obj = self.obj_pos_encoder(pos)
+            pos_scores = torch.mm(pos_subj, pos_obj.t())  # k x k
+            scores = scores + pos_scores
         relness = torch.sigmoid(scores)  # k x k
         return relness
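
The module scores every ordered (subject, object) pair in one matrix multiply: each box's class logits are projected into separate subject and object embeddings, and all their inner products give a k x k score grid; the new positional branch does the same over the 6-d box encoding and adds its scores before the sigmoid. A minimal sketch of the core computation (the sizes are illustrative):

import torch
import torch.nn as nn

k, dim = 5, 151                       # 5 boxes, 151 object classes (illustrative)
subj_proj = nn.Linear(dim, 64)
obj_proj = nn.Linear(dim, 64)

logits = torch.randn(k, dim)          # per-box class logits
scores = subj_proj(logits) @ obj_proj(logits).t()  # k x k: entry (i, j) scores pair i -> j
relness = torch.sigmoid(scores)       # relatedness in (0, 1)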

lib/scene_parser/rcnn/modeling/relation_heads/relpn/relpn.py

Lines changed: 58 additions & 28 deletions
@@ -31,7 +31,7 @@ def __init__(
         self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg
         self.use_matched_pairs_only = use_matched_pairs_only
         self.minimal_matched_pairs = minimal_matched_pairs
-        self.relationshipness = Relationshipness(self.cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES)
+        self.relationshipness = Relationshipness(self.cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES, pos_encoding=True)

     def match_targets_to_proposals(self, proposal, target):
         match_quality_matrix = boxlist_iou(target, proposal)
@@ -146,20 +146,22 @@ def _relpnsample_train(self, proposals, targets):
             enumerate(zip(proposals, sampled_pos_inds, sampled_neg_inds)):
             obj_logits = proposals_per_image.get_field('logits')
             obj_bboxes = proposals_per_image.bbox
-            relness = self.relationshipness(obj_logits, obj_bboxes)
+            relness = self.relationshipness(obj_logits, obj_bboxes, proposals_per_image.size)
             nondiag = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1)
             relness = relness.view(-1)[nondiag.nonzero()]
             relness_sorted, order = torch.sort(relness.view(-1), descending=True)
             img_sampled_inds = order[:self.cfg.MODEL.ROI_RELATION_HEAD.BATCH_SIZE_PER_IMAGE].view(-1)
             proposal_pairs_per_image = proposal_pairs[img_idx][img_sampled_inds]
             proposal_pairs[img_idx] = proposal_pairs_per_image

-            img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
-            relness = relness[img_sampled_inds]
-            pos_labels = torch.ones(len(pos_inds_img.nonzero()))
-            neg_labels = torch.zeros(len(neg_inds_img.nonzero()))
-            rellabels = torch.cat((pos_labels, neg_labels), 0).view(-1, 1)
-            losses += F.binary_cross_entropy(relness, rellabels.to(relness.device))
+            # import pdb; pdb.set_trace()
+            # img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
+            # relness = relness[img_sampled_inds]
+            # pos_labels = torch.ones(len(pos_inds_img.nonzero()))
+            # neg_labels = torch.zeros(len(neg_inds_img.nonzero()))
+            # rellabels = torch.cat((pos_labels, neg_labels), 0).view(-1, 1)
+            # losses += F.binary_cross_entropy(relness, rellabels.to(relness.device))
+            losses += F.binary_cross_entropy(relness, (labels[img_idx] > 0).view(-1, 1).float())

         # distributed sampled proposals, that were obtained on all feature maps
         # concatenated via the fg_bg_sampler, into individual feature map levels
@@ -174,42 +176,70 @@ def _relpnsample_train(self, proposals, targets):

         return proposal_pairs, losses

+    def _fullsample_test(self, proposals):
+        """
+        This method get all subject-object pairs, and return the proposals.
+        Note: this function keeps a state.
+
+        Arguments:
+            proposals (list[BoxList])
+        """
+        proposal_pairs = []
+        for i, proposals_per_image in enumerate(proposals):
+            box_subj = proposals_per_image.bbox
+            box_obj = proposals_per_image.bbox
+
+            box_subj = box_subj.unsqueeze(1).repeat(1, box_subj.shape[0], 1)
+            box_obj = box_obj.unsqueeze(0).repeat(box_obj.shape[0], 1, 1)
+            proposal_box_pairs = torch.cat((box_subj.view(-1, 4), box_obj.view(-1, 4)), 1)
+
+            idx_subj = torch.arange(box_subj.shape[0]).view(-1, 1, 1).repeat(1, box_obj.shape[0], 1).to(proposals_per_image.bbox.device)
+            idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals_per_image.bbox.device)
+            proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
+
+            keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
+
+            # if we filter non overlap bounding boxes
+            if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
+                ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
+                ious = ious[keep_idx]
+                keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
+            proposal_idx_pairs = proposal_idx_pairs[keep_idx]
+            proposal_box_pairs = proposal_box_pairs[keep_idx]
+            proposal_pairs_per_image = BoxPairList(proposal_box_pairs, proposals_per_image.size, proposals_per_image.mode)
+            proposal_pairs_per_image.add_field("idx_pairs", proposal_idx_pairs)
+
+            proposal_pairs.append(proposal_pairs_per_image)
+        return proposal_pairs
+
     def _relpnsample_test(self, proposals):
         """
         perform relpn based sampling during testing
         """
-        labels, proposal_pairs = self.prepare_targets(proposals, targets)
+        proposals[0] = proposals[0]
+        proposal_pairs = self._fullsample_test(proposals)
         proposal_pairs = list(proposal_pairs)

-        import pdb; pdb.set_trace()
-
-        losses = 0
-        for img_idx, (proposals_per_image) in \
-            enumerate(zip(proposals)):
+        relnesses = []
+        for img_idx, proposals_per_image in enumerate(proposals):
             obj_logits = proposals_per_image.get_field('logits')
             obj_bboxes = proposals_per_image.bbox
-            relness = self.relationshipness(obj_logits, obj_bboxes)
+            relness = self.relationshipness(obj_logits, obj_bboxes, proposals_per_image.size)
             nondiag = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1)
             relness = relness.view(-1)[nondiag.nonzero()]
-            relness_sorted, order = torch.sort(relness, descending=True)
-            img_sampled_inds = order[:self.cfg.MODEL.ROI_RELATION_HEAD.BATCH_SIZE_PER_IMAGE].view(-1)
+            relness_sorted, order = torch.sort(relness.view(-1), descending=True)
+            img_sampled_inds = order[:196].view(-1)
+            relness = relness_sorted[:196].view(-1)
             proposal_pairs_per_image = proposal_pairs[img_idx][img_sampled_inds]
             proposal_pairs[img_idx] = proposal_pairs_per_image
+            relnesses.append(relness)

-        # distributed sampled proposals, that were obtained on all feature maps
-        # concatenated via the fg_bg_sampler, into individual feature map levels
-        # for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
-        #     zip(sampled_pos_inds, sampled_neg_inds)
-        # ):
-        #     img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
-        #     proposal_pairs_per_image = proposal_pairs[img_idx][img_sampled_inds]
-        #     proposal_pairs[img_idx] = proposal_pairs_per_image
-
+        # self.cfg.MODEL.ROI_RELATION_HEAD.BATCH_SIZE_PER_IMAGE
         self._proposal_pairs = proposal_pairs

-        return proposal_pairs, {}
+        return proposal_pairs, relnesses

-    def forward(self, proposals, targets):
+    def forward(self, proposals, targets=None):
         """
         This method performs the positive/negative sampling, and return
         the sampled proposals.
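
Two sketches of what this diff does. First, the new _fullsample_test builds every ordered pair of the k proposals by broadcasting the box tensor against itself and dropping the diagonal (self-pairs); at test time the top 196 pairs by relatedness are then kept, a hard-coded cap standing in for BATCH_SIZE_PER_IMAGE. A compact illustration of the enumeration with made-up boxes:

import torch

k = 4
boxes = torch.rand(k, 4)                           # illustrative xyxy boxes

box_subj = boxes.unsqueeze(1).repeat(1, k, 1)      # k x k x 4, row i = subject i
box_obj = boxes.unsqueeze(0).repeat(k, 1, 1)       # k x k x 4, col j = object j
pairs = torch.cat((box_subj.view(-1, 4), box_obj.view(-1, 4)), 1)  # k*k x 8

idx = torch.arange(k)
keep = (idx.view(-1, 1) != idx.view(1, -1)).view(-1)  # mask out self-pairs
pairs = pairs[keep]                                # k*(k-1) x 8 ordered pairs

Second, the training change that gives the commit its title: instead of subsampling positive and negative pairs for the binary cross-entropy, the relpn is now supervised on all candidate pairs, with the target simply indicating whether a pair carries a non-background label. A hedged sketch with made-up shapes:

import torch
import torch.nn.functional as F

num_pairs = 12
relness = torch.rand(num_pairs, 1)               # predicted relatedness in (0, 1)
labels = torch.randint(0, 51, (num_pairs,))      # 0 = background (no relation)

target = (labels > 0).view(-1, 1).float()        # every pos and neg pair supervises
loss = F.binary_cross_entropy(relness, target)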
lib/scene_parser/rcnn/modeling/relation_heads/relpn/utils.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+import torch
+
+def box_pos_encoder(bboxes, width, height):
+    """
+    bounding box encoding
+    """
+    bboxes_enc = bboxes.clone()
+
+    dim0 = bboxes_enc[:, 0] / width
+    dim1 = bboxes_enc[:, 1] / height
+    dim2 = bboxes_enc[:, 2] / width
+    dim3 = bboxes_enc[:, 3] / height
+    dim4 = (bboxes_enc[:, 2] - bboxes_enc[:, 0]) * (bboxes_enc[:, 3] - bboxes_enc[:, 1]) / height / width
+    dim5 = (bboxes_enc[:, 3] - bboxes_enc[:, 1]) / (bboxes_enc[:, 2] - bboxes_enc[:, 0] + 1)
+
+    return torch.stack((dim0,dim1,dim2,dim3,dim4,dim5), 1)
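
box_pos_encoder maps each xyxy box to a 6-d geometry feature: the four coordinates normalized by image size, the normalized area, and the height-to-width aspect ratio (with +1 in the denominator to guard against zero-width boxes). A quick usage sketch, assuming the function above is importable and using made-up boxes:

import torch

boxes = torch.tensor([[10., 20., 110., 70.],
                      [ 0.,  0., 640., 480.]])    # illustrative xyxy boxes
pos = box_pos_encoder(boxes, 640, 480)            # width=640, height=480
print(pos.shape)                                  # torch.Size([2, 6])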
