-
Notifications
You must be signed in to change notification settings - Fork 41
Expand file tree
/
Copy pathdata_info.py
More file actions
124 lines (92 loc) · 4.53 KB
/
data_info.py
File metadata and controls
124 lines (92 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import json
import os
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
def load_data(json_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path to the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII text in the file.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)
def filter_data(data):
    """Return the entries of ``data`` that contain an ``"image"`` key.

    The relative order of the kept entries is preserved; entries without
    the key are skipped.
    """
    with_image = []
    for entry in data:
        if "image" in entry:
            with_image.append(entry)
    return with_image
from multiprocessing import Pool
import functools
def calculate_image_dimension(item, images_folder):
    """Return the ``(width, height)`` of the image referenced by ``item``.

    The image path is ``item["image"]`` joined onto ``images_folder``.
    Any error raised while opening or reading the image is reported on
    stdout and yields ``(None, None)`` instead of propagating.
    """
    image_path = os.path.join(images_folder, item["image"])
    try:
        with Image.open(image_path) as opened:
            size = opened.size
            return size[0], size[1]
    except Exception as err:
        # Best-effort: one unreadable image must not abort the whole scan.
        print(f"Error opening {image_path}: {err}")
    return None, None
def calculate_image_dimensions_multiprocess(filtered_data, images_folder, num_processes=256):
    """Measure image sizes for every item using a pool of worker processes.

    Args:
        filtered_data: Sequence of items, each with an ``"image"`` entry.
        images_folder: Folder the image paths are resolved against.
        num_processes: Upper bound on the number of worker processes.

    Returns:
        A ``(widths, heights)`` pair of lists. Items whose image could not
        be measured are dropped, so the lists may be shorter than
        ``filtered_data``; both are empty when nothing could be measured.
    """
    if not filtered_data:
        # Nothing to measure; also avoids spinning up a pool for no work.
        return [], []

    worker = functools.partial(calculate_image_dimension, images_folder=images_folder)
    # Never start more workers than there are items to process.
    pool_size = max(1, min(num_processes, len(filtered_data)))
    with Pool(pool_size) as p:
        dimensions = list(
            tqdm(
                p.imap(worker, filtered_data),
                total=len(filtered_data),
                desc="Calculating image dimensions",
            )
        )

    measured = [dim for dim in dimensions if dim[0] is not None]
    if not measured:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return [], []
    widths, heights = zip(*measured)
    return list(widths), list(heights)
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of pieces."""
    tokens = text.split()
    return tokens
def calculate_tokenized_lengths(data):
    """Return the token count of every conversation turn in ``data``.

    Each item's ``"conversations"`` list is walked in order and the
    ``"value"`` of every turn is tokenized; the result is one flat list
    of lengths across all items.
    """
    return [
        len(tokenize(turn["value"]))
        for sample in tqdm(data, desc="Tokenizing conversations")
        for turn in sample["conversations"]
    ]
import argparse
def _dimension_bin_edges(values):
    """Return histogram bin edges (step 100) covering ``values``."""
    lowest, highest = min(values), max(values)
    if lowest == highest:
        # A single distinct value still needs two edges to form one bin.
        return [lowest, highest + 1]
    return np.arange(lowest, highest + 100, 100)


def _plot_dimension_histogram(fig, ax, widths, heights, name):
    """Draw the 2D width/height density histogram and its colorbar on ``ax``."""
    _, _, _, mesh = ax.hist2d(
        widths,
        heights,
        bins=[_dimension_bin_edges(widths), _dimension_bin_edges(heights)],
        cmap=plt.cm.jet,
        density=True,
    )
    fig.colorbar(mesh, ax=ax)
    ax.set_xlabel("Width")
    ax.set_ylabel("Height")
    ax.set_title(f"dist_{name}_2d_w_h\nMax width: {max(widths)}, Max height: {max(heights)}", fontsize=10)


def _plot_length_histogram(ax, lengths, name, bin_width=100, label_every=8):
    """Draw the log-scale histogram of tokenized lengths on ``ax``."""
    # The last edge must lie beyond the largest length; otherwise values
    # above the final edge are silently left out of the histogram.
    edges = np.arange(0, max(lengths) + bin_width + 1, bin_width)
    counts, _ = np.histogram(lengths, bins=edges)
    # NOTE(review): the bar width of 7 is much narrower than the 100-wide
    # bins; kept unchanged so the rendering stays as before - confirm intent.
    ax.bar(edges[:-1], counts, width=7, edgecolor="black", log=True)

    # Label only every ``label_every``-th bin edge to keep the axis readable.
    ticks = edges[::label_every]
    ax.set_xticks(ticks)
    ax.set_xticklabels([int(tick) for tick in ticks], rotation=90, fontsize=8)
    ax.set_xlim(edges[0], edges[-1])
    ax.set_xlabel("Tokenized Length")
    ax.set_ylabel("Count (log scale)")
    ax.set_title(f"dist_{name}_tokenized_length", fontsize=8)


def main():
    """Plot image-size and conversation-length distributions for a dataset.

    Reads the JSON file given by ``--json_path``, measures the images of
    all items that have an ``"image"`` entry, tokenizes every conversation
    turn, and saves a two-panel figure named ``dist_<name>_combined.png``
    into ``--output_dir``, where ``<name>`` is the JSON file name with
    ``.json`` removed.
    """
    parser = argparse.ArgumentParser(description="Process data for LLaVA_Next project.")
    # Required: without it the path would be None and fail later with an
    # unhelpful AttributeError.
    parser.add_argument("--json_path", type=str, required=True, help="Path to the JSON file containing data.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing images.")
    parser.add_argument(
        "--output_dir",
        type=str,
        default="/mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/notebooks/sft_data",
        help="Directory where the combined plot is saved.",
    )
    args = parser.parse_args()

    json_path = args.json_path
    llava_instruct_name = os.path.basename(json_path).replace(".json", "")
    images_folder = args.images_folder

    data = load_data(json_path)
    filtered_data = filter_data(data)

    widths, heights = [], []
    if filtered_data:
        print(f"Total data items: {len(data)}, Filtered data items: {len(filtered_data)}")
        widths, heights = calculate_image_dimensions_multiprocess(filtered_data, images_folder)
        # Every image may have failed to open, leaving nothing to summarize.
        if widths:
            print(f"Max width: {max(widths)}, Max height: {max(heights)}")

    tokenized_lengths = calculate_tokenized_lengths(data)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 12))

    if widths:
        _plot_dimension_histogram(fig, ax1, widths, heights, llava_instruct_name)

    if tokenized_lengths:
        _plot_length_histogram(ax2, tokenized_lengths, llava_instruct_name)
    else:
        # max() of an empty list would raise; leave the panel empty instead.
        ax2.set_title(f"dist_{llava_instruct_name}_tokenized_length (no conversations)", fontsize=8)

    plt.tight_layout()
    os.makedirs(args.output_dir, exist_ok=True)
    output_path = os.path.join(args.output_dir, f"dist_{llava_instruct_name}_combined.png")
    plt.savefig(output_path)
    plt.close(fig)
    print(f"Plots saved to {output_path}")


if __name__ == "__main__":
    main()