226 changes: 206 additions & 20 deletions armory/art_experimental/attacks/carla_adversarial_texture.py
@@ -1,4 +1,6 @@
 import numpy as np
+import torch
+from typing import Optional
 
 from art.attacks.evasion import AdversarialTexturePyTorch

@@ -9,8 +11,187 @@ class AdversarialPhysicalTexture(AdversarialTexturePyTorch):
"""

def __init__(self, estimator, **kwargs):
self.attack_kwargs = kwargs
super(AdversarialTexturePyTorch, self).__init__(estimator=estimator)
# use dummy patch height/width for initialization
super().__init__(estimator=estimator, patch_height=1, patch_width=1, **kwargs)
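+        # the real patch size is not known until generate() receives the green
+        # screen coordinates, so patch height/width are reset there before attack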

+    # Copied from ART 1.10.0 and modified to accommodate dynamic shadowing on the patch
+    def _apply_texture(
+        self,
+        videos: "torch.Tensor",
+        patch: "torch.Tensor",
+        foreground: Optional["torch.Tensor"],
+        patch_points: Optional[np.ndarray],
+    ) -> "torch.Tensor":
+        """
+        Apply texture over background and overlay foreground.
+        :param videos: Video samples.
+        :param patch: Patch to apply.
+        :param foreground: Foreground mask.
+        :param patch_points: Array of shape (nb_frames, 4, 2) containing four pairs of integers (height, width)
+                             corresponding to the coordinates of the four corners top-left, top-right, bottom-right,
+                             bottom-left of the transformed image in the coordinate-system of the original image.
+        :return: Patched videos.
+        """
+        import torch  # lgtm [py/repeated-import]
+        import torchvision
+
+        nb_samples = videos.shape[0]
+        nb_frames = videos.shape[1]
+        frame_height = videos.shape[2]
+        frame_width = videos.shape[3]
+
+        image_mask = self._get_patch_mask(nb_samples=nb_samples)
+        image_mask = image_mask.float()
+
+        patch = patch.float()
+        padded_patch = torch.stack([patch] * nb_samples)
+
+        if patch_points is None:
+            pad_h_before = self.x_min
+            pad_h_after = int(
+                videos.shape[self.i_h + 1]
+                - pad_h_before
+                - image_mask.shape[self.i_h_patch + 1]
+            )
+
+            pad_w_before = self.y_min
+            pad_w_after = int(
+                videos.shape[self.i_w + 1]
+                - pad_w_before
+                - image_mask.shape[self.i_w_patch + 1]
+            )
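+            # e.g. (hypothetical sizes) a 100x150 patch with x_min=50, y_min=80
+            # in 600x800 frames yields pad_h_after = 600 - 50 - 100 = 450 and
+            # pad_w_after = 800 - 80 - 150 = 570, zero-padding the patch out to
+            # a full-size frame at its scene location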

+            image_mask = image_mask.permute(0, 3, 1, 2)
+
+            image_mask = torchvision.transforms.functional.pad(
+                img=image_mask,
+                padding=[pad_w_before, pad_h_before, pad_w_after, pad_h_after],
+                fill=0,
+                padding_mode="constant",
+            )
+
+            image_mask = image_mask.permute(0, 2, 3, 1)
+
+            image_mask = torch.unsqueeze(image_mask, dim=1)
+            image_mask = torch.repeat_interleave(image_mask, dim=1, repeats=nb_frames)
+            image_mask = image_mask.float()
+
+            padded_patch = padded_patch.permute(0, 3, 1, 2)
+
+            padded_patch = torchvision.transforms.functional.pad(
+                img=padded_patch,
+                padding=[pad_w_before, pad_h_before, pad_w_after, pad_h_after],
+                fill=0,
+                padding_mode="constant",
+            )
+
+            padded_patch = padded_patch.permute(0, 2, 3, 1)
+
+            padded_patch = torch.unsqueeze(padded_patch, dim=1)
+            padded_patch = torch.repeat_interleave(
+                padded_patch, dim=1, repeats=nb_frames
+            )
+
+            padded_patch = padded_patch.float()
+
+        else:
+
+            startpoints = [
+                [0, 0],
+                [frame_width, 0],
+                [frame_width, frame_height],
+                [0, frame_height],
+            ]
+            endpoints = np.zeros_like(patch_points)
+            endpoints[:, :, 0] = patch_points[:, :, 1]
+            endpoints[:, :, 1] = patch_points[:, :, 0]
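+            # patch_points stores (height, width) pairs, while torchvision's
+            # perspective transform expects (x, y) = (width, height) corners,
+            # hence the axis swap into endpoints above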

+            image_mask = image_mask.permute(0, 3, 1, 2)
+
+            image_mask = torchvision.transforms.functional.resize(
+                img=image_mask,
+                size=[int(videos.shape[2]), int(videos.shape[3])],
+                interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+            )
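+            # the mask is first stretched to the full frame size, then warped
+            # per frame from the frame corners (startpoints) onto the green
+            # screen corners (endpoints) below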

+            image_mask_list = []
+
+            for i_frame in range(nb_frames):
+
+                image_mask_i = torchvision.transforms.functional.perspective(
+                    img=image_mask,
+                    startpoints=startpoints,
+                    endpoints=endpoints[i_frame],
+                    interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+                    fill=0,
+                )
+
+                image_mask_i = image_mask_i.permute(0, 2, 3, 1)
+
+                image_mask_list.append(image_mask_i)
+
+            image_mask = torch.stack(image_mask_list, dim=1)
+            image_mask = image_mask.float()
+
+            padded_patch = padded_patch.permute(0, 3, 1, 2)
+
+            padded_patch = torchvision.transforms.functional.resize(
+                img=padded_patch,
+                size=[int(videos.shape[2]), int(videos.shape[3])],
+                interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+            )
+
+            padded_patch_list = []
+
+            for i_frame in range(nb_frames):
+                padded_patch_i = torchvision.transforms.functional.perspective(
+                    img=padded_patch,
+                    startpoints=startpoints,
+                    endpoints=endpoints[i_frame],
+                    interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
+                    fill=0,
+                )
+
+                padded_patch_i = padded_patch_i.permute(0, 2, 3, 1)
+
+                padded_patch_list.append(padded_patch_i)
+
+            padded_patch = torch.stack(padded_patch_list, dim=1)
+            padded_patch = padded_patch.float()
+
+        inverted_mask = (
+            torch.from_numpy(np.ones(shape=image_mask.shape, dtype=np.float32)).to(
+                self.estimator.device
+            )
+            - image_mask
+        )
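+        # inverted_mask is 1 outside the patch region and 0 inside it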

+        # Adjust green screen brightness
+        v_avg = 0.5647  # average V value (in HSV) for the green screen, which is #00903a
+        green_screen = videos * image_mask
+        values, _ = torch.max(green_screen, dim=4, keepdim=True)
+        values_ratio = values / v_avg
+        values_ratio = torch.repeat_interleave(values_ratio, dim=4, repeats=3)
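+        # values / v_avg is > 1 where the observed green screen is brighter than
+        # its reference V (#00903a -> G = 144, V = 144/255 = 0.5647) and < 1 in
+        # shadow; scaling the patch by this ratio transfers scene lighting onto it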

+        if foreground is not None:
+            combined = (
+                videos * inverted_mask
+                + padded_patch * values_ratio * image_mask
+                - padded_patch * values_ratio * ~foreground.bool()
+                + videos * ~foreground.bool() * image_mask
+            )
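+            # term by term: keep the video outside the green screen, paste the
+            # brightness-adjusted patch inside it, then wherever foreground is
+            # False (an occluding object) remove the patch and restore the
+            # original pixels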

+            combined = torch.clamp(
+                combined,
+                min=self.estimator.clip_values[0],
+                max=self.estimator.clip_values[1],
+            )
+        else:
+            combined = videos * inverted_mask + padded_patch * values_ratio * image_mask
+
+        return combined
 
     def generate(self, x, y, y_patch_metadata=None, **kwargs):
         """
@@ -25,16 +206,6 @@ def generate(self, x, y, y_patch_metadata=None, **kwargs):
         - cc_ground_truth: ground truth color information stored as np.ndarray with shape (24,3)
         - cc_scene: scene color information stored as np.ndarray with shape (24,3)
         - masks: binarized masks of the patch, where masks[n,x,y] == 1 means patch pixel in frame n and at position (x,y)
-        :Keyword Arguments:
-            * *shuffle* (``np.ndarray``) --
-                Shuffle order of samples, labels, initial boxes, and foregrounds for texture generation.
-            * *y_init* (``np.ndarray``) --
-                Initial boxes around object to be tracked of shape (nb_samples, 4) with second dimension representing
-                [x1, y1, x2, y2] with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
-            * *foreground* (``np.ndarray``) --
-                Foreground masks of shape NFHWC of boolean values with False/0.0 representing foreground, preventing
-                updates to the texture, and True/1.0 for background, allowing updates to the texture.
-        :return: An array with adversarial patch and an array of the patch mask.
         """

         if x.shape[0] > 1:
@@ -48,24 +219,39 @@ def generate(self, x, y, y_patch_metadata=None, **kwargs):
         x_min = int(np.min(gs_coords[:, 1]))
         y_min = int(np.min(gs_coords[:, 0]))
 
-        attack = AdversarialTexturePyTorch(
-            self.estimator,
-            patch_height=patch_height,
-            patch_width=patch_width,
-            x_min=x_min,
-            y_min=y_min,
-            **self.attack_kwargs
+        self.patch_height = patch_height
+        self.patch_width = patch_width
+        self.x_min = x_min
+        self.y_min = y_min
+
+        # reinitialize patch
+        self.patch_shape = (patch_height, patch_width, 3)
+        mean_value = (
+            self.estimator.clip_values[1] - self.estimator.clip_values[0]
+        ) / 2.0 + self.estimator.clip_values[0]
+        self._initial_value = np.ones(self.patch_shape) * mean_value
+        self._patch = torch.tensor(
+            self._initial_value, requires_grad=True, device=self.estimator.device
         )
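+        # e.g. with clip_values (0, 1) the patch restarts as uniform 0.5 gray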

         # this mask is used to embed the patch into the background in the event of occlusion
         foreground = y_patch_metadata[0]["masks"]
         foreground = np.array([foreground])

+        # create patch points indicating locations of the four corners of the patch in each frame
+        if gs_coords.ndim == 2:  # same location for all frames
+            patch_points = np.tile(gs_coords[:, ::-1], (x.shape[1], 1, 1))
+        else:
+            patch_points = gs_coords[:, :, ::-1]
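+        # gs_coords holds (x, y) corner coordinates; reversing the last axis
+        # gives the (height, width) order that _apply_texture expects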

         generate_kwargs = {
             "y_init": y[0]["boxes"][0:1],
             "foreground": foreground,
             "shuffle": kwargs.get("shuffle", False),
+            "patch_points": patch_points,
         }
         generate_kwargs = {**generate_kwargs, **kwargs}
-        attacked_video = attack.generate(x, y, **generate_kwargs)
+
+        attacked_video = super().generate(x, y, **generate_kwargs)
 
         return attacked_video