Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
741b889
vit base
sineeli Nov 14, 2024
13dae08
Add vit backbone, classifier and preprocessor layers
sineeli Nov 15, 2024
b64b137
update args
sineeli Nov 15, 2024
429d635
add default args
sineeli Nov 15, 2024
6d69abc
correct build method
sineeli Nov 15, 2024
2e87884
fix build issues
sineeli Nov 15, 2024
bd3cce0
fix bugs
sineeli Nov 16, 2024
4232a06
Update backbone args and configs
sineeli Nov 18, 2024
32b08c5
correct position ids dtype
sineeli Nov 18, 2024
cc938c6
build token layer
sineeli Nov 18, 2024
78812de
token layer build
sineeli Nov 18, 2024
8a20465
assign correct dtype to TokenLayer
sineeli Nov 18, 2024
de754cc
fix build shape of token layer
sineeli Nov 18, 2024
84ba896
correct mlp dense var names
sineeli Nov 18, 2024
7a70e16
use default norm mean and std as per hugging face config
sineeli Nov 18, 2024
81e3021
correct position_ids
sineeli Nov 19, 2024
d3061d6
remove separate token layer
sineeli Nov 19, 2024
618e163
correct position ids
sineeli Nov 19, 2024
2338637
Checkpoint conversion script and minor changes
sineeli Nov 21, 2024
95e5868
correct flag type
sineeli Nov 21, 2024
9d2e5bd
correct key name
sineeli Nov 21, 2024
ac7d1d3
use flat list later we can extract in between layers if needed
sineeli Nov 21, 2024
8065c01
Add test cases and correct dtype policy for model
sineeli Nov 21, 2024
a8be824
add proper docstrings
sineeli Nov 21, 2024
3f027a0
correct test cases
sineeli Nov 22, 2024
05acb70
use numpy for test data
sineeli Nov 25, 2024
521df6f
nit
sineeli Nov 25, 2024
ae2b800
nit
sineeli Nov 27, 2024
26c2224
Merge branch 'master' into sineeli/ViT
sineeli Dec 2, 2024
92149d5
add presets
sineeli Dec 2, 2024
5374c70
load vit preset from hugging face directly
sineeli Dec 5, 2024
ebee9ef
nit
sineeli Dec 5, 2024
93064bd
handle num classes case for ViT
sineeli Dec 5, 2024
e206e7b
replace token with first
sineeli Dec 9, 2024
7a39d5b
convert all vit checkpoints using tools
sineeli Dec 10, 2024
0827954
Add custom ImageClassifier for ViT
sineeli Dec 10, 2024
ae9319a
remove token pooling and rename representation_size to intermediate_dim
sineeli Dec 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix bugs
  • Loading branch information
sineeli committed Nov 16, 2024
commit bd3cce0a1e4d4d69d1f42b64b7f482a474144151
9 changes: 7 additions & 2 deletions keras_hub/src/models/image_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,12 @@ def __init__(
dtype=head_dtype,
name="pooler",
)
elif pooling == "token":
self.pooler = None
else:
raise ValueError(
"Unknown `pooling` type. Polling should be either `'avg'` or "
f"`'max'`. Received: pooling={pooling}."
f"`'max' or 'token'`. Received: pooling={pooling}."
)
self.output_dropout = keras.layers.Dropout(
dropout,
Expand All @@ -137,7 +139,10 @@ def __init__(
# === Functional Model ===
inputs = self.backbone.input
x = self.backbone(inputs)
x = self.pooler(x)
if pooling == "token": # used for Vision Transformer(ViT)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"token" feels like a bit of a weird name here, especially when compared to "avg" or "max". Maybe "first"?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually wouldn't this also break for other classifier types? I think this "token" pooling would fail to actually pool over a 2d output from most backbone, and similarly global avg 2d pooling would fail to pool correctly for a vit backbone right (since it's a 1d sequence after patching)? Instead we should subclass here, and not let pooling be configurable for vit. See https://github.com/keras-team/keras-hub/blob/master/keras_hub/src/models/vgg/vgg_image_classifier.py as an example of this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yes, I was thinking earlier to subclass and totally write a new one. Thanks for pointing it out; I will make the changes required.

Copy link
Collaborator Author

@sineeli sineeli Dec 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mattdangerw

Also, from Hugging Face I observed that there is one more dense layer if the model is not used for image classification, which they call the pooling layer; it just has a dense layer (which projects to the same hidden dimension) and a tanh activation.

Should we include this? If we are considering only image classification, this layer wouldn't be present.

ViTModel: https://github.com/huggingface/transformers/blob/91b8ab18b778ae9e2f8191866e018cd1dc7097be/src/transformers/models/vit/modeling_vit.py#L576

Image Classification: https://github.com/huggingface/transformers/blob/91b8ab18b778ae9e2f8191866e018cd1dc7097be/src/transformers/models/vit/modeling_vit.py#L823C37-L823C54

Any thoughts ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

x = x[:, 0]
else:
x = self.pooler(x)
x = self.output_dropout(x)
outputs = self.output_dense(x)
super().__init__(
Expand Down
4 changes: 1 addition & 3 deletions keras_hub/src/models/vit/vit_backbone.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(
dtype=dtype,
)(inputs)

x = ViTEncoder(
output = ViTEncoder(
num_layers=num_layers,
num_heads=num_heads,
hidden_dim=hidden_dim,
Expand All @@ -64,8 +64,6 @@ def __init__(
dtype=dtype,
)(x)

output = x[:, 0]

super().__init__(
inputs=inputs,
outputs=output,
Expand Down
49 changes: 0 additions & 49 deletions keras_hub/src/models/vit/vit_image_classifier.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import keras

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.models.image_classifier import ImageClassifier
from keras_hub.src.models.vit.vit_backbone import ViTBackbone
Expand All @@ -12,50 +10,3 @@
class ViTImageClassifier(ImageClassifier):
backbone_cls = ViTBackbone
preprocessor_cls = ViTImageClassifierPreprocessor

def __init__(
self,
backbone,
num_classes,
preprocessor=None,
activation=None,
head_dtype=None,
**kwargs,
):
head_dtype = head_dtype or backbone.dtype_policy

# === Layers ===
self.backbone = backbone
self.preprocessor = preprocessor

self.output_dense = keras.layers.Dense(
num_classes,
activation=activation,
dtype=head_dtype,
name="predictions",
)

# === Functional Model ===
inputs = self.backbone.input
x = self.backbone(inputs)
outputs = self.output_dense(x)
super().__init__(
inputs=inputs,
outputs=outputs,
**kwargs,
)

# === Config ===
self.num_classes = num_classes
self.activation = activation

def get_config(self):
# Backbone serialized in `super`
config = super().get_config()
config.update(
{
"num_classes": self.num_classes,
"pooling": self.pooling,
}
)
return config