Skip to content

unitorch.models.siglip¤

SiglipProcessor¤

Bases: HfTextClassificationProcessor

Initializes the SiglipProcessor.

Parameters:

Name Type Description Default
vocab_path str

Path to the vocabulary file.

None
vision_config_path str

Path to the vision configuration file.

None
max_seq_length int

Maximum sequence length for text inputs. Defaults to 128.

128
position_start_id int

Starting position ID. Defaults to 0.

0
Source code in src/unitorch/models/siglip/processing.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def __init__(
    self,
    vocab_path: Optional[str] = None,
    vision_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 128,
    position_start_id: Optional[int] = 0,
):
    """
    Set up the tokenizer and the image processor used by SiglipProcessor.

    Args:
        vocab_path (str, optional): Vocabulary file for the tokenizer. When None, the
            tokenizer is loaded from the "google/siglip-base-patch16-224" checkpoint.
        vision_config_path (str, optional): JSON file for the image processor. When None,
            the image processor is loaded from the same pretrained checkpoint.
        max_seq_length (int, optional): Maximum sequence length for text inputs. Defaults to 128.
        position_start_id (int, optional): Starting position ID. Defaults to 0.
    """
    # Fallback checkpoint shared by the tokenizer and the image processor.
    pretrained_name = "google/siglip-base-patch16-224"

    if vocab_path is None:
        tokenizer = SiglipTokenizer.from_pretrained(pretrained_name)
    else:
        tokenizer = SiglipTokenizer(vocab_file=vocab_path)

    # Mirror BOS/EOS onto the CLS/SEP slots before the tokenizer is handed to the
    # base processor (presumably the base class reads cls/sep -- confirm).
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=0,
        position_start_id=position_start_id,
    )

    if vision_config_path is None:
        self.vision_processor = SiglipImageProcessor.from_pretrained(pretrained_name)
    else:
        self.vision_processor = SiglipImageProcessor.from_json_file(vision_config_path)

vision_processor instance-attribute ¤

vision_processor = from_json_file(vision_config_path)

text_classification ¤

text_classification(
    text: str, max_seq_length: Optional[int] = None
)

Processes text for classification.

Parameters:

Name Type Description Default
text str

Input text.

required
max_seq_length int

Maximum sequence length. Defaults to None.

None

Returns:

Name Type Description
GenericOutputs

Processed text inputs.

Source code in src/unitorch/models/siglip/processing.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def text_classification(
    self,
    text: str,
    max_seq_length: Optional[int] = None,
):
    """
    Encode a single text through the base text-classification processor.

    Args:
        text (str): Input text.
        max_seq_length (int, optional): Maximum sequence length, forwarded unchanged to
            the base processor. Defaults to None.

    Returns:
        GenericOutputs: Contains input_ids, attention_mask and position_ids.
    """
    encoded = HfTextClassificationProcessor.classification(
        self, text=text, max_seq_length=max_seq_length
    )
    # Only these three fields are copied from the base processor's result.
    kept_fields = ("input_ids", "attention_mask", "position_ids")
    return GenericOutputs(**{name: getattr(encoded, name) for name in kept_fields})

image_classification ¤

image_classification(image: Union[Image, str])

Processes an image for classification.

Parameters:

Name Type Description Default
image Image or str

Input image or path.

required

Returns:

Name Type Description
GenericOutputs

Processed image inputs.

Source code in src/unitorch/models/siglip/processing.py
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def image_classification(
    self,
    image: Union[Image.Image, str],
):
    """
    Run one image through the vision processor.

    Args:
        image (PIL.Image.Image or str): Input image. The value is passed as-is to the
            vision processor's ``preprocess`` (NOTE(review): whether a str path is
            accepted depends on that processor -- confirm).

    Returns:
        GenericOutputs: Contains pixel_values for the image.
    """
    processed = self.vision_processor.preprocess(image, return_tensors="pt")
    # Take the first entry of the returned pixel_values for this single image.
    single_image_pixels = processed.pixel_values[0]
    return GenericOutputs(pixel_values=single_image_pixels)

classification ¤

classification(
    text: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Processes text and image for multimodal classification.

Parameters:

Name Type Description Default
text str

Input text.

required
image Image or str

Input image or path.

required
max_seq_length int

Maximum sequence length. Defaults to None.

None

Returns:

Name Type Description
GenericOutputs

Processed text and image inputs.

Source code in src/unitorch/models/siglip/processing.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """
    Prepare a text/image pair for multimodal classification.

    Args:
        text (str): Input text.
        image (PIL.Image.Image or str): Input image, passed as-is to the vision
            processor's ``preprocess``.
        max_seq_length (int, optional): Maximum sequence length for the text side.
            Defaults to None.

    Returns:
        GenericOutputs: Contains input_ids, attention_mask, position_ids and pixel_values.
    """
    encoded_text = self.text_classification(text=text, max_seq_length=max_seq_length)

    processed_image = self.vision_processor.preprocess(image, return_tensors="pt")
    # Take the first entry of the returned pixel_values for this single image.
    pixel_values = processed_image.pixel_values[0]

    return GenericOutputs(
        input_ids=encoded_text.input_ids,
        attention_mask=encoded_text.attention_mask,
        position_ids=encoded_text.position_ids,
        pixel_values=pixel_values,
    )

SiglipForPretrain¤

Bases: GenericModel

Siglip model for pretraining.

Initializes the SiglipForPretrain model.

Parameters:

Name Type Description Default
config_path str

Path to the model configuration file.

required
freeze_base_model bool

Whether to freeze the base model parameters. Defaults to True.

True
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
use_all_gather bool

Whether to use all-gather for distributed training. Defaults to True.

True
Source code in src/unitorch/models/siglip/modeling.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(
    self,
    config_path: str,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
    use_all_gather: Optional[bool] = True,
):
    """
    Build the text and vision towers plus the learnable logit scale for pretraining.

    Args:
        config_path (str): Path to the model configuration file.
        freeze_base_model (bool, optional): When True, every parameter of the text and
            vision models gets requires_grad=False. Defaults to True.
        gradient_checkpointing (bool, optional): Value written to both sub-configs and
            to both encoders' ``gradient_checkpointing`` flag. Defaults to False.
        use_all_gather (bool, optional): Stored on the instance; read by ``forward`` to
            decide whether embeddings are gathered across processes. Defaults to True.
    """
    super().__init__()
    siglip_config = SiglipConfig.from_json_file(config_path)
    text_cfg = siglip_config.text_config
    vision_cfg = siglip_config.vision_config
    # The flag is set on the configs before the models are constructed from them.
    for sub_cfg in (text_cfg, vision_cfg):
        sub_cfg.gradient_checkpointing = gradient_checkpointing
    vision_cfg.vision_use_head = True

    self.use_all_gather = use_all_gather
    self.text_embed_dim = text_cfg.hidden_size
    self.vision_embed_dim = vision_cfg.hidden_size

    self.text_model = SiglipTextModel(text_cfg)
    self.vision_model = SiglipVisionModel(vision_cfg)
    # Scalar parameter; forward() exponentiates it before scaling the logits.
    self.logit_scale = nn.Parameter(
        torch.ones([]) * siglip_config.logit_scale_init_value
    )

    self.init_weights()

    if freeze_base_model:
        for tower in (self.text_model, self.vision_model):
            for param in tower.parameters():
                param.requires_grad = False

    # Also set the flag directly on the already-built encoders.
    for tower in (self.text_model, self.vision_model):
        tower.encoder.gradient_checkpointing = gradient_checkpointing

use_all_gather instance-attribute ¤

use_all_gather = use_all_gather

text_embed_dim instance-attribute ¤

text_embed_dim = hidden_size

vision_embed_dim instance-attribute ¤

vision_embed_dim = hidden_size

text_model instance-attribute ¤

text_model = SiglipTextModel(text_config)

vision_model instance-attribute ¤

vision_model = SiglipVisionModel(vision_config)

logit_scale instance-attribute ¤

logit_scale = Parameter(ones([]) * logit_scale_init_value)

_all_gather ¤

_all_gather(input: Tensor) -> Tensor

Performs all-gather on the input tensor across distributed processes.

Parameters:

Name Type Description Default
input Tensor

Input tensor to gather.

required

Returns:

Type Description
Tensor

torch.Tensor: Gathered tensor.

Source code in src/unitorch/models/siglip/modeling.py
67
68
69
70
71
72
73
74
75
76
77
78
def _all_gather(self, input: torch.Tensor) -> torch.Tensor:
    """
    Pass the tensor through ``AllGather.apply`` and merge the result's first two dimensions.

    Args:
        input (torch.Tensor): Input tensor to gather.

    Returns:
        torch.Tensor: The gathered tensor viewed with its two leading dimensions
            collapsed into one; trailing dimensions are unchanged.
    """
    gathered = AllGather.apply(input)
    # presumably gathered is (world_size, batch, ...) -- TODO confirm against AllGather
    trailing_dims = gathered.shape[2:]
    return gathered.view(-1, *trailing_dims)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the SiglipForPretrain model.

Parameters:

Name Type Description Default
input_ids Tensor

Input text token IDs.

required
pixel_values Tensor

Input image pixel values.

required
attention_mask Tensor

Attention mask for the input.

required
position_ids Tensor

Position IDs for the input tokens.

required

Returns:

Type Description

torch.Tensor: Contrastive loss.

Source code in src/unitorch/models/siglip/modeling.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the SiglipForPretrain model.

    Encodes the images and texts, L2-normalizes both embeddings, optionally gathers
    them across processes, and feeds the scaled text-to-image similarity matrix to
    ``_clip_loss``.

    Args:
        input_ids (torch.Tensor): Input text token IDs.
        pixel_values (torch.Tensor): Input image pixel values.
        attention_mask (torch.Tensor): Attention mask for the input.
        position_ids (torch.Tensor): Position IDs for the input tokens.

    Returns:
        torch.Tensor: Contrastive loss.
    """
    vision_outputs = self.vision_model(pixel_values=pixel_values)
    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )

    # Index 1 of each model output is used as the embedding
    # (presumably the pooled output -- confirm against the model classes).
    image_embeds = vision_outputs[1]
    text_embeds = text_outputs[1]

    # Clamp the norm so an all-zero embedding normalizes to zeros instead of NaN,
    # which would otherwise propagate into the loss for the whole batch.
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True).clamp(
        min=1e-12
    )
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True).clamp(
        min=1e-12
    )

    logit_scale = self.logit_scale.exp()
    # Gather only when requested and a process group has actually been initialized.
    if self.use_all_gather and dist.is_initialized():
        text_embeds = self._all_gather(text_embeds)
        image_embeds = self._all_gather(image_embeds)

    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
    return _clip_loss(logits_per_text)

SiglipForClassification¤

Bases: GenericModel

Siglip model for multimodal classification.

Parameters:

Name Type Description Default
config_path str

Path to the Siglip configuration file.

required
num_classes int

Number of output classes. Defaults to 1.

1
freeze_base_model bool

Whether to freeze the base model. Defaults to True.

True
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/siglip/modeling.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build both Siglip towers and a linear head over their combined embedding width.

    Args:
        config_path (str): Path to the Siglip configuration file.
        num_classes (int, optional): Number of output classes. Defaults to 1.
        freeze_base_model (bool, optional): When True, every parameter of the text and
            vision models gets requires_grad=False. Defaults to True.
        gradient_checkpointing (bool, optional): Value written to both sub-configs and
            to both encoders' ``gradient_checkpointing`` flag. Defaults to False.
    """
    super().__init__()
    siglip_config = SiglipConfig.from_json_file(config_path)
    text_cfg = siglip_config.text_config
    vision_cfg = siglip_config.vision_config
    # The flag is set on the configs before the models are constructed from them.
    for sub_cfg in (text_cfg, vision_cfg):
        sub_cfg.gradient_checkpointing = gradient_checkpointing
    vision_cfg.vision_use_head = True

    self.text_embed_dim = text_cfg.hidden_size
    self.vision_embed_dim = vision_cfg.hidden_size

    self.text_model = SiglipTextModel(text_cfg)
    self.vision_model = SiglipVisionModel(vision_cfg)
    # The head's input width is the sum of the two hidden sizes.
    head_in_features = self.text_embed_dim + self.vision_embed_dim
    self.classifier = nn.Linear(head_in_features, num_classes)

    self.init_weights()

    if freeze_base_model:
        for tower in (self.text_model, self.vision_model):
            for param in tower.parameters():
                param.requires_grad = False

    # Also set the flag directly on the already-built encoders.
    for tower in (self.text_model, self.vision_model):
        tower.encoder.gradient_checkpointing = gradient_checkpointing

text_embed_dim instance-attribute ¤

text_embed_dim = hidden_size

vision_embed_dim instance-attribute ¤

vision_embed_dim = hidden_size

text_model instance-attribute ¤

text_model = SiglipTextModel(text_config)

vision_model instance-attribute ¤

vision_model = SiglipVisionModel(vision_config)

classifier instance-attribute ¤

classifier = Linear(
    text_embed_dim + vision_embed_dim, num_classes
)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the SiglipForClassification model.

Parameters:

Name Type Description Default
input_ids Tensor

Input text token IDs.

required
pixel_values Tensor

Input image pixel values.

required
attention_mask Tensor

Attention mask for the input.

required
position_ids Tensor

Position IDs for the input tokens.

required

Returns:

Type Description

torch.Tensor: Classification logits.

Source code in src/unitorch/models/siglip/modeling.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the SiglipForClassification model.

    Concatenates the image embedding and the text embedding (image first) along
    dimension 1, applies ReLU, and runs the classifier head on the result.

    Args:
        input_ids (torch.Tensor): Input text token IDs.
        pixel_values (torch.Tensor): Input image pixel values.
        attention_mask (torch.Tensor): Attention mask for the input.
        position_ids (torch.Tensor): Position IDs for the input tokens.

    Returns:
        torch.Tensor: Classification logits.
    """
    # Index 1 of each model output is used as the embedding
    # (presumably the pooled output -- confirm against the model classes).
    image_embeds = self.vision_model(pixel_values=pixel_values)[1]
    text_embeds = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )[1]

    fused = torch.cat([image_embeds, text_embeds], dim=1)
    activated = F.relu(fused)
    return self.classifier(activated)

SiglipForTextClassification¤

Bases: GenericModel

Siglip model for text classification.

Parameters:

Name Type Description Default
config_path str

Path to the Siglip configuration file.

required
num_classes int

Number of output classes. Defaults to 1.

1
freeze_base_model bool

Whether to freeze the base model. Defaults to True.

True
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/siglip/modeling.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the Siglip text tower and a linear classification head on top of it.

    Args:
        config_path (str): Path to the Siglip configuration file.
        num_classes (int, optional): Number of output classes. Defaults to 1.
        freeze_base_model (bool, optional): When True, every parameter of the text
            model gets requires_grad=False. Defaults to True.
        gradient_checkpointing (bool, optional): Value written to the text config and
            to the encoder's ``gradient_checkpointing`` flag. Defaults to False.
    """
    super().__init__()
    text_cfg = SiglipConfig.from_json_file(config_path).text_config
    # The flag is set on the config before the model is constructed from it.
    text_cfg.gradient_checkpointing = gradient_checkpointing

    self.text_embed_dim = text_cfg.hidden_size
    self.text_model = SiglipTextModel(text_cfg)
    self.classifier = nn.Linear(self.text_embed_dim, num_classes)

    self.init_weights()

    if freeze_base_model:
        for param in self.text_model.parameters():
            param.requires_grad = False

    # Also set the flag directly on the already-built encoder.
    self.text_model.encoder.gradient_checkpointing = gradient_checkpointing

text_embed_dim instance-attribute ¤

text_embed_dim = hidden_size

text_model instance-attribute ¤

text_model = SiglipTextModel(text_config)

classifier instance-attribute ¤

classifier = Linear(text_embed_dim, num_classes)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the SiglipForTextClassification model.

Parameters:

Name Type Description Default
input_ids Tensor

Input token IDs.

required
attention_mask Tensor

Attention mask.

required
position_ids Tensor

Position IDs.

required

Returns:

Type Description

torch.Tensor: Classification logits.

Source code in src/unitorch/models/siglip/modeling.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the SiglipForTextClassification model.

    Applies ReLU to the text embedding and runs the classifier head on it.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        attention_mask (torch.Tensor): Attention mask.
        position_ids (torch.Tensor): Position IDs.

    Returns:
        torch.Tensor: Classification logits.
    """
    encoder_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    # Index 1 of the model output is used as the embedding
    # (presumably the pooled output -- confirm against the model class).
    text_embeds = self.text_model(**encoder_kwargs)[1]
    activated = F.relu(text_embeds)
    return self.classifier(activated)

SiglipForImageClassification¤

Bases: GenericModel

Siglip model for image classification.

Parameters:

Name Type Description Default
config_path str

Path to the Siglip configuration file.

required
num_classes int

Number of output classes. Defaults to 1.

1
freeze_base_model bool

Whether to freeze the base model. Defaults to True.

True
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/siglip/modeling.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the Siglip vision tower and a linear classification head on top of it.

    Args:
        config_path (str): Path to the Siglip configuration file.
        num_classes (int, optional): Number of output classes. Defaults to 1.
        freeze_base_model (bool, optional): When True, every parameter of the vision
            model gets requires_grad=False. Defaults to True.
        gradient_checkpointing (bool, optional): Value written to the vision config and
            to the encoder's ``gradient_checkpointing`` flag. Defaults to False.
    """
    super().__init__()
    vision_cfg = SiglipConfig.from_json_file(config_path).vision_config
    # Both settings are applied to the config before the model is constructed from it.
    vision_cfg.gradient_checkpointing = gradient_checkpointing
    vision_cfg.vision_use_head = True

    self.vision_embed_dim = vision_cfg.hidden_size
    self.vision_model = SiglipVisionModel(vision_cfg)
    self.classifier = nn.Linear(self.vision_embed_dim, num_classes)

    self.init_weights()

    if freeze_base_model:
        for param in self.vision_model.parameters():
            param.requires_grad = False

    # Also set the flag directly on the already-built encoder.
    self.vision_model.encoder.gradient_checkpointing = gradient_checkpointing

vision_embed_dim instance-attribute ¤

vision_embed_dim = hidden_size

vision_model instance-attribute ¤

vision_model = SiglipVisionModel(vision_config)

classifier instance-attribute ¤

classifier = Linear(vision_embed_dim, num_classes)

forward ¤

forward(pixel_values: Tensor)

Forward pass of the SiglipForImageClassification model.

Parameters:

Name Type Description Default
pixel_values Tensor

Input image pixel values.

required

Returns:

Type Description

torch.Tensor: Classification logits.

Source code in src/unitorch/models/siglip/modeling.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
def forward(
    self,
    pixel_values: torch.Tensor,
):
    """
    Forward pass of the SiglipForImageClassification model.

    Applies ReLU to the image embedding and runs the classifier head on it.

    Args:
        pixel_values (torch.Tensor): Input image pixel values.

    Returns:
        torch.Tensor: Classification logits.
    """
    # Index 1 of the model output is used as the embedding
    # (presumably the pooled output -- confirm against the model class).
    image_embeds = self.vision_model(pixel_values=pixel_values)[1]
    activated = F.relu(image_embeds)
    return self.classifier(activated)

SiglipForMatching¤

Bases: GenericModel, PeftWeightLoaderMixin

Siglip model for image-text matching.

Parameters:

Name Type Description Default
config_path str

Path to the Siglip configuration file.

required
freeze_base_model bool

Whether to freeze the base model. Defaults to True.

True
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/siglip/modeling.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
def __init__(
    self,
    config_path: str,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build both Siglip towers and a 1-to-1 linear layer applied to the similarity score.

    Args:
        config_path (str): Path to the Siglip configuration file.
        freeze_base_model (bool, optional): When True, every parameter of the text and
            vision models gets requires_grad=False. Defaults to True.
        gradient_checkpointing (bool, optional): Value written to both sub-configs and
            to both encoders' ``gradient_checkpointing`` flag. Defaults to False.

    Raises:
        AssertionError: If the text and vision hidden sizes differ.
    """
    super().__init__()
    siglip_config = SiglipConfig.from_json_file(config_path)
    text_cfg = siglip_config.text_config
    vision_cfg = siglip_config.vision_config
    # The flag is set on the configs before the models are constructed from them.
    for sub_cfg in (text_cfg, vision_cfg):
        sub_cfg.gradient_checkpointing = gradient_checkpointing
    vision_cfg.vision_use_head = True

    self.text_embed_dim = text_cfg.hidden_size
    self.vision_embed_dim = vision_cfg.hidden_size
    # forward() multiplies the two embeddings element-wise, so the widths must agree.
    assert self.text_embed_dim == self.vision_embed_dim

    self.text_model = SiglipTextModel(text_cfg)
    self.vision_model = SiglipVisionModel(vision_cfg)
    self.classifier = nn.Linear(1, 1)

    self.init_weights()
    # Override the classifier weight after init_weights() so it starts at 5.0.
    self.classifier.weight.data.fill_(5.0)

    if freeze_base_model:
        for tower in (self.text_model, self.vision_model):
            for param in tower.parameters():
                param.requires_grad = False

    # Also set the flag directly on the already-built encoders.
    for tower in (self.text_model, self.vision_model):
        tower.encoder.gradient_checkpointing = gradient_checkpointing

replace_keys_in_peft_state_dict class-attribute instance-attribute ¤

replace_keys_in_peft_state_dict = {
    "peft_model.base_model.model.": ""
}

text_embed_dim instance-attribute ¤

text_embed_dim = hidden_size

vision_embed_dim instance-attribute ¤

vision_embed_dim = hidden_size

text_model instance-attribute ¤

text_model = SiglipTextModel(text_config)

vision_model instance-attribute ¤

vision_model = SiglipVisionModel(vision_config)

classifier instance-attribute ¤

classifier = Linear(1, 1)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the SiglipForMatching model.

Parameters:

Name Type Description Default
input_ids Tensor

Input text token IDs.

required
pixel_values Tensor

Input image pixel values.

required
attention_mask Tensor

Attention mask for the input.

required
position_ids Tensor

Position IDs for the input tokens.

required

Returns:

Type Description

torch.Tensor: Matching scores.

Source code in src/unitorch/models/siglip/modeling.py
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the SiglipForMatching model.

    Encodes the image and the text, L2-normalizes both embeddings, takes their
    row-wise dot product, and passes that score through the classifier layer.

    Args:
        input_ids (torch.Tensor): Input text token IDs.
        pixel_values (torch.Tensor): Input image pixel values.
        attention_mask (torch.Tensor): Attention mask for the input.
        position_ids (torch.Tensor): Position IDs for the input tokens.

    Returns:
        torch.Tensor: Matching scores, one per row with a trailing dimension of size 1.
    """
    vision_outputs = self.vision_model(pixel_values=pixel_values)
    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    # Index 1 of each model output is used as the embedding
    # (presumably the pooled output -- confirm against the model classes).
    image_embeds = vision_outputs[1]
    text_embeds = text_outputs[1]

    # Clamp the norm so an all-zero embedding normalizes to zeros (score 0)
    # instead of producing a NaN score from a 0/0 division.
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True).clamp(
        min=1e-12
    )
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True).clamp(
        min=1e-12
    )

    # keepdim=True leaves a size-1 last dimension to match the Linear(1, 1) classifier.
    scores = torch.sum(text_embeds * image_embeds, dim=-1, keepdim=True)
    return self.classifier(scores)