forked from nikopueringer/CorridorKey
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwrapper.py
More file actions
280 lines (235 loc) · 10.7 KB
/
wrapper.py
File metadata and controls
280 lines (235 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import os
import os.path as osp
import cv2
import random
import logging
import time
from pathlib import Path
from easydict import EasyDict
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import ToTensor, Resize, Compose
from diffusers import AutoencoderKLTemporalDecoder, FlowMatchEulerDiscreteScheduler
from tqdm import tqdm
# Relative imports from the internal gvm package
# Assuming this file is inside gvm_core/
from .gvm.pipelines.pipeline_gvm import GVMPipeline
from .gvm.utils.inference_utils import VideoReader, VideoWriter, ImageSequenceReader, ImageSequenceWriter
from .gvm.models.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
def seed_all(seed: int = 0):
    """Put every random number generator used here into a known state.

    WARNING: this is a global side effect. It reseeds Python's ``random``
    module, NumPy's global RNG, torch's default generator and, when CUDA is
    available, the generators of all CUDA devices. GVMProcessor.__init__
    calls it.
    """
    # CPU-side generators first, in the same order for every call.
    for reseed in (random.seed, np.random.seed, torch.manual_seed):
        reseed(seed)

    # Nothing more to do on machines without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
def impad_multi(img, multiple=32):
    """Pad a batch so that its height and width are multiples of ``multiple``.

    The padding is split as evenly as possible between the two sides of each
    axis; when the amount is odd, the extra pixel goes to the bottom / right.

    Args:
        img: 4-D tensor indexed as (N, C, H, W). Only H and W are padded.
        multiple: Value the padded height and width must be divisible by.

    Returns:
        A tuple ``(padded, (pad_top, pad_left, pad_bottom, pad_right))``.
        The second element is what a caller needs to crop the padding off
        again. Note its order differs from the order ``F.pad`` takes.
    """
    h, w = img.shape[2], img.shape[3]
    target_h = int(np.ceil(h / multiple) * multiple)
    target_w = int(np.ceil(w / multiple) * multiple)

    pad_top = (target_h - h) // 2
    pad_bottom = target_h - h - pad_top
    pad_left = (target_w - w) // 2
    pad_right = target_w - w - pad_left

    # Reflection padding requires every pad to be strictly smaller than the
    # dimension it mirrors. pad_bottom >= pad_top and pad_right >= pad_left,
    # so checking the larger side of each axis is sufficient. Inputs that are
    # too small to reflect fall back to edge replication instead of raising.
    can_reflect = pad_bottom < h and pad_right < w
    mode = 'reflect' if can_reflect else 'replicate'

    # F.pad expects (padding_left, padding_right, padding_top, padding_bottom)
    padded = F.pad(img, (pad_left, pad_right, pad_top, pad_bottom), mode=mode)
    return padded, (pad_top, pad_left, pad_bottom, pad_right)
def sequence_collate_fn(examples):
    """Merge per-frame samples into one batch dictionary.

    Every element of ``examples`` is indexed with the keys ``"image"`` (a
    tensor) and ``"filename"``. The images are stacked along a new leading
    dimension, made contiguous and converted to float32; the filenames are
    collected in the same order.

    Returns:
        ``{'rgb_values': <stacked float tensor>, 'rgb_names': <list of names>}``
    """
    frames = [sample["image"] for sample in examples]
    stacked = torch.stack(frames)
    stacked = stacked.to(memory_format=torch.contiguous_format)
    stacked = stacked.float()

    names = [sample["filename"] for sample in examples]

    batch = {}
    batch['rgb_values'] = stacked
    batch['rgb_names'] = names
    return batch
class GVMProcessor:
    """Load the GVM models once and run alpha inference on videos or image folders.

    ``__init__`` loads the VAE, scheduler and UNet in float16 and assembles
    them into a ``GVMPipeline``. ``process_sequence`` then feeds the pipeline
    batches of frames and writes the predicted alpha as a PNG sequence and,
    optionally, an MP4.
    """

    # Length the shorter image edge is resized to before inference.
    _SHORT_EDGE = 1024
    # Upper bound for the longer edge after resizing.
    _LONG_EDGE_CAP = 1920
    # (height, width) assumed when the input resolution cannot be probed.
    _FALLBACK_RESOLUTION = (1080, 1920)
    # Inputs with one of these suffixes are read as a video file.
    _VIDEO_SUFFIXES = ('.mp4', '.mkv', '.gif', '.mov', '.avi')
    # Files with one of these suffixes are picked up from an image folder.
    _IMAGE_SUFFIXES = ('.jpg', '.png', '.jpeg', '.exr')

    def __init__(self,
                 model_base=None,
                 unet_base=None,
                 lora_base=None,
                 device="cpu",
                 seed=None):
        """Load all model components and build the inference pipeline.

        Args:
            model_base: Directory containing the ``vae`` and ``scheduler``
                subfolders (and ``unet`` unless ``unet_base`` is given).
                Defaults to a ``weights`` folder next to this file.
            unet_base: Optional alternative directory whose ``unet``
                subfolder is loaded instead of the one under ``model_base``.
            lora_base: Optional LoRA location; when truthy it is passed to
                ``pipe.load_lora_weights``.
            device: Anything accepted by ``torch.device``.
            seed: Seed handed to ``seed_all`` (which reseeds global RNGs).
                Defaults to the current Unix time in whole seconds.
        """
        self.device = torch.device(device)

        # Resolve default weights path relative to this file
        if model_base is None:
            model_base = osp.join(osp.dirname(__file__), "weights")

        self.model_base = model_base
        self.unet_base = unet_base
        self.lora_base = lora_base

        if seed is None:
            seed = int(time.time())
        seed_all(seed)

        logging.info(f"Loading GVM models from {model_base}...")
        self.vae = AutoencoderKLTemporalDecoder.from_pretrained(
            model_base, subfolder="vae", torch_dtype=torch.float16)
        self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
            model_base, subfolder="scheduler")

        unet_folder = unet_base if unet_base is not None else model_base
        self.unet = UNetSpatioTemporalConditionModel.from_pretrained(
            unet_folder,
            subfolder="unet",
            class_embed_type=None,
            torch_dtype=torch.float16
        )

        self.pipe = GVMPipeline(vae=self.vae, unet=self.unet, scheduler=self.scheduler)

        # LoRA weights are optional; without them the plain UNet is used.
        if lora_base:
            self.pipe.load_lora_weights(lora_base)

        self.pipe = self.pipe.to(self.device, dtype=torch.float16)
        logging.info("Models loaded.")

    @staticmethod
    def _resize_max_size(orig_h, orig_w):
        """Return the ``max_size`` argument for ``Resize``, or ``None``.

        The long edge that results from scaling the short edge to
        ``_SHORT_EDGE`` is capped at ``_LONG_EDGE_CAP``. torchvision rejects
        a ``max_size`` that is not strictly greater than ``size``, so for
        square and near-square inputs ``None`` (no cap) is returned.
        """
        short_side, long_side = sorted((orig_h, orig_w))
        ratio = long_side / short_side
        new_long = min(int(GVMProcessor._SHORT_EDGE * ratio), GVMProcessor._LONG_EDGE_CAP)
        return new_long if new_long > GVMProcessor._SHORT_EDGE else None

    @staticmethod
    def _video_frame_names(batch_id, frames_per_batch, count):
        """Return output names for the ``count`` frames of video batch ``batch_id``.

        Indices are based on the nominal batch size rather than the size of
        the current batch, so a shorter final batch continues the numbering
        instead of reusing earlier indices.
        """
        start = batch_id * frames_per_batch
        return [f"{start + offset:05d}.jpg" for offset in range(count)]

    @staticmethod
    def _probe_resolution(input_path, is_video):
        """Return ``(height, width)`` of the input, or ``None`` for an empty folder.

        Videos are probed through ``cv2.VideoCapture``; image folders through
        their first supported file in sorted order. When no usable size can
        be read, ``_FALLBACK_RESOLUTION`` is returned and a warning is logged.
        """
        if is_video:
            cap = cv2.VideoCapture(str(input_path))
            try:
                width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            finally:
                cap.release()
        else:
            image_files = sorted(
                f for f in input_path.iterdir()
                if f.is_file() and f.suffix.lower() in GVMProcessor._IMAGE_SUFFIXES
            )
            if not image_files:
                return None

            # Use cv2 for EXR support if needed
            first_img_path = str(image_files[0])
            if first_img_path.lower().endswith('.exr'):
                # Respect a value the user already set; otherwise enable EXR.
                os.environ.setdefault("OPENCV_IO_ENABLE_OPENEXR", "1")
                img = cv2.imread(first_img_path, cv2.IMREAD_UNCHANGED)
            else:
                img = cv2.imread(first_img_path)
            height, width = img.shape[:2] if img is not None else (0, 0)

        if height <= 0 or width <= 0:
            # A zero size would otherwise cause a division by zero below.
            height, width = GVMProcessor._FALLBACK_RESOLUTION
            logging.warning(
                f"Could not determine resolution of {input_path}; assuming {width}x{height}")
        return height, width

    def process_sequence(self, input_path, output_dir,
                         num_frames_per_batch=8,
                         denoise_steps=1,
                         max_frames=None,
                         decode_chunk_size=8,
                         num_interp_frames=1,
                         num_overlap_frames=1,
                         use_clip_img_emb=False,
                         noise_type='zeros',
                         mode='matte',
                         write_video=True,
                         direct_output_dir=None):
        """
        Process a single video or directory of images.

        Frames are resized so the short edge is 1024 (long edge capped at
        1920), padded to a multiple of 32, run through the pipeline, cropped
        back and written out as alpha images.

        Args:
            input_path: Video file (recognised by suffix) or a directory of
                images.
            output_dir: Root output folder. Results go to
                ``<output_dir>/<input stem>/``; ignored when
                ``direct_output_dir`` is given.
            num_frames_per_batch: DataLoader batch size, also forwarded to
                the pipeline as ``num_frames``.
            denoise_steps: Forwarded as ``num_inference_steps``.
            max_frames: Forwarded to the video reader only.
            decode_chunk_size, num_interp_frames, num_overlap_frames,
            use_clip_img_emb, noise_type, mode: Forwarded to the pipeline
                unchanged.
            write_video: Also write ``<stem>_alpha.mp4`` next to the PNGs.
            direct_output_dir: When set, PNGs (and the MP4) are written
                straight into this folder.

        Returns:
            None. When an image directory contains no supported files a
            warning is logged and nothing is written.
        """
        input_path = Path(input_path)
        file_name = input_path.stem
        is_video = input_path.suffix.lower() in self._VIDEO_SUFFIXES

        # --- Determine Resolution & Upscaling ---
        resolution = self._probe_resolution(input_path, is_video)
        if resolution is None:
            logging.warning(f"No images found in {input_path}")
            return
        orig_h, orig_w = resolution

        transform = Compose([
            ToTensor(),
            Resize(size=1024, max_size=self._resize_max_size(orig_h, orig_w), antialias=True)
        ])

        if is_video:
            reader = VideoReader(
                str(input_path),
                max_frames=max_frames,
                transform=transform
            )
        else:
            reader = ImageSequenceReader(
                str(input_path),
                transform=transform
            )

        # Get upscaled shape from first frame
        first_frame = reader[0]
        if isinstance(first_frame, dict):
            first_frame = first_frame['image']
        # Round H and W down to even numbers; the alpha is resized to this.
        current_upscaled_shape = tuple(dim - (dim % 2) for dim in first_frame.shape[1:])

        # Output preparation
        fps = getattr(reader, 'frame_rate', 24.0)

        if direct_output_dir:
            # Write directly to this folder
            os.makedirs(direct_output_dir, exist_ok=True)
            writer_alpha_seq = ImageSequenceWriter(direct_output_dir, extension='png')
            writer_alpha = None
            if write_video:
                # The video goes into the same directory under a fixed name.
                writer_alpha = VideoWriter(
                    osp.join(direct_output_dir, f"{file_name}_alpha.mp4"), frame_rate=fps)
        else:
            # Create output directory for this specific file
            file_output_dir = osp.join(output_dir, file_name)
            os.makedirs(file_output_dir, exist_ok=True)
            logging.info(f"Processing {input_path} -> {file_output_dir}")

            writer_alpha = VideoWriter(
                osp.join(file_output_dir, f"{file_name}_alpha.mp4"), frame_rate=fps) if write_video else None
            writer_alpha_seq = ImageSequenceWriter(osp.join(file_output_dir, "alpha_seq"), extension='png')

        # Alpha values at or beyond these bounds are snapped to 1.0 / 0.0.
        upper_bound = 240./255.
        lower_bound = 25./ 255.

        try:
            # Dataloader
            if is_video:
                dataloader = DataLoader(reader, batch_size=num_frames_per_batch)
            else:
                dataloader = DataLoader(reader, batch_size=num_frames_per_batch, collate_fn=sequence_collate_fn)

            for batch_id, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Inferencing {file_name}"):
                if is_video:
                    filenames = self._video_frame_names(batch_id, num_frames_per_batch, batch.shape[0])
                else:
                    filenames = batch['rgb_names']
                    batch = batch['rgb_values']

                # Pad (Reflective)
                batch, pad_info = impad_multi(batch)

                # Inference
                with torch.no_grad():
                    pipe_out = self.pipe(
                        batch.to(self.device, dtype=torch.float16),
                        num_frames=num_frames_per_batch,
                        num_overlap_frames=num_overlap_frames,
                        num_interp_frames=num_interp_frames,
                        decode_chunk_size=decode_chunk_size,
                        num_inference_steps=denoise_steps,
                        mode=mode,
                        use_clip_img_emb=use_clip_img_emb,
                        noise_type=noise_type,
                        ensemble_size=1,
                    )
                image = pipe_out.image
                alpha = pipe_out.alpha

                # Crop padding
                out_h, out_w = image.shape[2:]
                pad_t, pad_l, pad_b, pad_r = pad_info
                alpha = alpha[:, :, pad_t:out_h - pad_b, pad_l:out_w - pad_r]

                # Resize to ensure exact match if there's any discrepancy
                alpha = F.interpolate(alpha, current_upscaled_shape, mode='bilinear')

                # Threshold
                alpha[alpha >= upper_bound] = 1.0
                alpha[alpha <= lower_bound] = 0.0

                if writer_alpha:
                    writer_alpha.write(alpha)
                writer_alpha_seq.write(alpha, filenames=filenames)
        finally:
            # Close the writers even if inference fails part-way through.
            try:
                if writer_alpha:
                    writer_alpha.close()
            finally:
                writer_alpha_seq.close()

        logging.info(f"Finished {file_name}")