diff --git a/armory/art_experimental/attacks/carla_adversarial_texture.py b/armory/art_experimental/attacks/carla_adversarial_texture.py
index e9464d4a1..ad9031a15 100755
--- a/armory/art_experimental/attacks/carla_adversarial_texture.py
+++ b/armory/art_experimental/attacks/carla_adversarial_texture.py
@@ -1,4 +1,6 @@
 import numpy as np
+import torch
+from typing import Optional
 
 from art.attacks.evasion import AdversarialTexturePyTorch
 
@@ -9,8 +11,187 @@ class AdversarialPhysicalTexture(AdversarialTexturePyTorch):
     """
 
     def __init__(self, estimator, **kwargs):
-        self.attack_kwargs = kwargs
-        super(AdversarialTexturePyTorch, self).__init__(estimator=estimator)
+        # use dummy patch height/width for initialization
+        super().__init__(estimator=estimator, patch_height=1, patch_width=1, **kwargs)
+
+    # Copied from ART 1.10.0 and modified to accommodate dynamic shadowing on patch
+    def _apply_texture(
+        self,
+        videos: "torch.Tensor",
+        patch: "torch.Tensor",
+        foreground: Optional["torch.Tensor"],
+        patch_points: Optional[np.ndarray],
+    ) -> "torch.Tensor":
+        """
+        Apply texture over background and overlay foreground.
+
+        :param videos: Video samples.
+        :param patch: Patch to apply.
+        :param foreground: Foreground mask.
+        :param patch_points: Array of shape (nb_frames, 4, 2) containing four pairs of integers (height, width)
+            corresponding to the coordinates of the four corners top-left, top-right, bottom-right, bottom-left
+            of the transformed image in the coordinate-system of the original image.
+        :return: Patched videos.
+        """
+        import torch  # lgtm [py/repeated-import]
+        import torchvision
+
+        nb_samples = videos.shape[0]
+        nb_frames = videos.shape[1]
+        frame_height = videos.shape[2]
+        frame_width = videos.shape[3]
+
+        image_mask = self._get_patch_mask(nb_samples=nb_samples)
+        image_mask = image_mask.float()
+
+        patch = patch.float()
+        padded_patch = torch.stack([patch] * nb_samples)
+
+        if patch_points is None:
+            pad_h_before = self.x_min
+            pad_h_after = int(
+                videos.shape[self.i_h + 1]
+                - pad_h_before
+                - image_mask.shape[self.i_h_patch + 1]
+            )
+
+            pad_w_before = self.y_min
+            pad_w_after = int(
+                videos.shape[self.i_w + 1]
+                - pad_w_before
+                - image_mask.shape[self.i_w_patch + 1]
+            )
+
+            image_mask = image_mask.permute(0, 3, 1, 2)
+
+            image_mask = torchvision.transforms.functional.pad(
+                img=image_mask,
+                padding=[pad_w_before, pad_h_before, pad_w_after, pad_h_after],
+                fill=0,
+                padding_mode="constant",
+            )
+
+            image_mask = image_mask.permute(0, 2, 3, 1)
+
+            image_mask = torch.unsqueeze(image_mask, dim=1)
+            image_mask = torch.repeat_interleave(image_mask, dim=1, repeats=nb_frames)
+            image_mask = image_mask.float()
+
+            padded_patch = padded_patch.permute(0, 3, 1, 2)
+
+            padded_patch = torchvision.transforms.functional.pad(
+                img=padded_patch,
+                padding=[pad_w_before, pad_h_before, pad_w_after, pad_h_after],
+                fill=0,
+                padding_mode="constant",
+            )
+
+            padded_patch = padded_patch.permute(0, 2, 3, 1)
+
+            padded_patch = torch.unsqueeze(padded_patch, dim=1)
+            padded_patch = torch.repeat_interleave(
+                padded_patch, dim=1, repeats=nb_frames
+            )
+
+            padded_patch = padded_patch.float()
+
+        else:
+
+            startpoints = [
+                [0, 0],
+                [frame_width, 0],
+                [frame_width, frame_height],
+                [0, frame_height],
+            ]
+            endpoints = np.zeros_like(patch_points)
+            endpoints[:, :, 0] = patch_points[:, :, 1]
+            endpoints[:, :, 1] = patch_points[:, :, 0]
+
+            image_mask = image_mask.permute(0, 3, 1, 2)
+
+            image_mask = torchvision.transforms.functional.resize(
+                img=image_mask,
+                size=[int(videos.shape[2]), int(videos.shape[3])],
+                interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+            )
+
+            image_mask_list = []
+
+            for i_frame in range(nb_frames):
+
+                image_mask_i = torchvision.transforms.functional.perspective(
+                    img=image_mask,
+                    startpoints=startpoints,
+                    endpoints=endpoints[i_frame],
+                    interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+                    fill=0,
+                )
+
+                image_mask_i = image_mask_i.permute(0, 2, 3, 1)
+
+                image_mask_list.append(image_mask_i)
+
+            image_mask = torch.stack(image_mask_list, dim=1)
+            image_mask = image_mask.float()
+
+            padded_patch = padded_patch.permute(0, 3, 1, 2)
+
+            padded_patch = torchvision.transforms.functional.resize(
+                img=padded_patch,
+                size=[int(videos.shape[2]), int(videos.shape[3])],
+                interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+            )
+
+            padded_patch_list = []
+
+            for i_frame in range(nb_frames):
+                padded_patch_i = torchvision.transforms.functional.perspective(
+                    img=padded_patch,
+                    startpoints=startpoints,
+                    endpoints=endpoints[i_frame],
+                    interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+                    fill=0,
+                )
+
+                padded_patch_i = padded_patch_i.permute(0, 2, 3, 1)
+
+                padded_patch_list.append(padded_patch_i)
+
+            padded_patch = torch.stack(padded_patch_list, dim=1)
+            padded_patch = padded_patch.float()
+
+        inverted_mask = (
+            torch.from_numpy(np.ones(shape=image_mask.shape, dtype=np.float32)).to(
+                self.estimator.device
+            )
+            - image_mask
+        )
+
+        # Adjust green screen brightness
+        v_avg = (
+            0.5647  # average V value (in HSV) for the green screen, which is #00903a
+        )
+        green_screen = videos * image_mask
+        values, _ = torch.max(green_screen, dim=4, keepdim=True)
+        values_ratio = values / v_avg
+        values_ratio = torch.repeat_interleave(values_ratio, dim=4, repeats=3)
+
+        if foreground is not None:
+            combined = (
+                videos * inverted_mask
+                + padded_patch * values_ratio * image_mask
+                - padded_patch * values_ratio * ~foreground.bool()
+                + videos * ~foreground.bool() * image_mask
+            )
+
+            combined = torch.clamp(
+                combined,
+                min=self.estimator.clip_values[0],
+                max=self.estimator.clip_values[1],
+            )
+        else:
+            combined = videos * inverted_mask + padded_patch * values_ratio * image_mask
+
+        return combined
 
     def generate(self, x, y, y_patch_metadata=None, **kwargs):
         """
@@ -25,16 +206,6 @@ def generate(self, x, y, y_patch_metadata=None, **kwargs):
             - cc_ground_truth: ground truth color information stored as np.ndarray with shape (24,3)
             - cc_scene: scene color information stored as np.ndarray with shape (24,3)
             - masks: binarized masks of the patch, where masks[n,x,y] == 1 means patch pixel in frame n and at position (x,y)
-        :Keyword Arguments:
-            * *shuffle* (``np.ndarray``) --
-                Shuffle order of samples, labels, initial boxes, and foregrounds for texture generation.
-            * *y_init* (``np.ndarray``) --
-                Initial boxes around object to be tracked of shape (nb_samples, 4) with second dimension representing
-                [x1, y1, x2, y2] with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
-            * *foreground* (``np.ndarray``) --
-                Foreground masks of shape NFHWC of boolean values with False/0.0 representing foreground, preventing
-                updates to the texture, and True/1.0 for background, allowing updates to the texture.
-
         :return: An array with adversarial patch and an array of the patch mask.
         """
         if x.shape[0] > 1:
@@ -48,24 +219,39 @@ def generate(self, x, y, y_patch_metadata=None, **kwargs):
         x_min = int(np.min(gs_coords[:, 1]))
         y_min = int(np.min(gs_coords[:, 0]))
 
-        attack = AdversarialTexturePyTorch(
-            self.estimator,
-            patch_height=patch_height,
-            patch_width=patch_width,
-            x_min=x_min,
-            y_min=y_min,
-            **self.attack_kwargs
-        )
+        self.patch_height = patch_height
+        self.patch_width = patch_width
+        self.x_min = x_min
+        self.y_min = y_min
+
+        # reinitialize patch
+        self.patch_shape = (patch_height, patch_width, 3)
+        mean_value = (
+            self.estimator.clip_values[1] - self.estimator.clip_values[0]
+        ) / 2.0 + self.estimator.clip_values[0]
+        self._initial_value = np.ones(self.patch_shape) * mean_value
+        self._patch = torch.tensor(
+            self._initial_value, requires_grad=True, device=self.estimator.device
+        )
 
         # this masked to embed patch into the background in the event of occlusion
         foreground = y_patch_metadata[0]["masks"]
         foreground = np.array([foreground])
 
+        # create patch points indicating locations of the four corners of the patch in each frame
+        if gs_coords.ndim == 2:  # same location for all frames
+            patch_points = np.tile(gs_coords[:, ::-1], (x.shape[1], 1, 1))
+        else:
+            patch_points = gs_coords[:, :, ::-1]
+
         generate_kwargs = {
             "y_init": y[0]["boxes"][0:1],
             "foreground": foreground,
             "shuffle": kwargs.get("shuffle", False),
+            "patch_points": patch_points,
         }
         generate_kwargs = {**generate_kwargs, **kwargs}
-        attacked_video = attack.generate(x, y, **generate_kwargs)
+
+        attacked_video = super().generate(x, y, **generate_kwargs)
+
         return attacked_video