
unitorch.models.clip

ClipProcessor

Bases: HfImageClassificationProcessor, HfTextClassificationProcessor

Initializes the ClipProcessor.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `vocab_path` | `str` | The path to the vocabulary file. | `None` |
| `merge_path` | `str` | The path to the merge file. | `None` |
| `vision_config_path` | `str` | The path to the vision configuration file. | `None` |
| `max_seq_length` | `int` | The maximum sequence length for text inputs. Defaults to 128. | `128` |
| `position_start_id` | `int` | The starting position ID for positional embeddings. Defaults to 0. | `0` |
Source code in src/unitorch/models/clip/processing.py
def __init__(
    self,
    vocab_path: Optional[str] = None,
    merge_path: Optional[str] = None,
    vision_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 128,
    position_start_id: Optional[int] = 0,
):
    """
    Initializes the ClipProcessor.

    Args:
        vocab_path (str): The path to the vocabulary file.
        merge_path (str): The path to the merge file.
        vision_config_path (str): The path to the vision configuration file.
        max_seq_length (int, optional): The maximum sequence length for text inputs. Defaults to 128.
        position_start_id (int, optional): The starting position ID for positional embeddings. Defaults to 0.
    """
    if vision_config_path is not None:
        vision_processor = CLIPImageProcessor.from_json_file(vision_config_path)
    else:
        vision_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32"
        )
    HfImageClassificationProcessor.__init__(
        self,
        vision_processor=vision_processor,
    )

    if vocab_path is not None and merge_path is not None:
        tokenizer = CLIPTokenizer(
            vocab_file=vocab_path,
            merges_file=merge_path,
        )
    else:
        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token
    HfTextClassificationProcessor.__init__(
        self,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=0,
        position_start_id=position_start_id,
    )
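
For reference, a minimal construction sketch (assuming `ClipProcessor` is importable from `unitorch.models.clip`, as this page's module path suggests). With no arguments, the processor falls back to the `openai/clip-vit-base-patch32` tokenizer and image processor from the Hugging Face hub, as the code above shows; the second variant uses local files at hypothetical paths.

```python
from unitorch.models.clip import ClipProcessor

# Default construction: pulls the openai/clip-vit-base-patch32 tokenizer
# and image processor from the Hugging Face hub.
processor = ClipProcessor()

# Construction from local files (hypothetical paths).
processor = ClipProcessor(
    vocab_path="/path/to/clip/vocab.json",
    merge_path="/path/to/clip/merges.txt",
    vision_config_path="/path/to/clip/preprocessor_config.json",
    max_seq_length=77,  # CLIP's usual text context length
)
```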

classification

classification(
    text: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Performs classification using text and image inputs.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text. | required |
| `image` | `Image` | The input image. | required |
| `max_seq_length` | `int` | The maximum sequence length for text inputs. Defaults to None. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `GenericOutputs` | An object containing the processed inputs. |

Source code in src/unitorch/models/clip/processing.py
def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """
    Performs classification using text and image inputs.

    Args:
        text (str): The input text.
        image (PIL.Image.Image): The input image.
        max_seq_length (int, optional): The maximum sequence length for text inputs. Defaults to None.

    Returns:
        GenericOutputs: An object containing the processed inputs.
    """
    text_outputs = self.text_classification(
        text=text,
        max_seq_length=max_seq_length,
    )
    pixel_outputs = self.image_classification(
        image=image,
    )

    return GenericOutputs(
        input_ids=text_outputs.input_ids,
        attention_mask=text_outputs.attention_mask,
        position_ids=text_outputs.position_ids,
        pixel_values=pixel_outputs.pixel_values,
    )
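
A short usage sketch, assuming a default-constructed processor and a local image file (hypothetical filename). Per the signature, `image` may be a `PIL.Image.Image` or a path string.

```python
from PIL import Image

from unitorch.models.clip import ClipProcessor

processor = ClipProcessor()

image = Image.open("cat.jpg").convert("RGB")  # hypothetical file
inputs = processor.classification(text="a photo of a cat", image=image)

# inputs bundles input_ids, attention_mask, position_ids and pixel_values,
# ready to feed to the CLIP models documented below.
```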

image_classification

image_classification(image: Union[Image, str])

Performs image classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `image` | `Image` | The input image. | required |

Returns:

| Type | Description |
| --- | --- |
| `GenericOutputs` | An object containing the processed inputs. |

Source code in src/unitorch/models/clip/processing.py
def image_classification(
    self,
    image: Union[Image.Image, str],
):
    """
    Performs image classification.

    Args:
        image (PIL.Image.Image): The input image.

    Returns:
        GenericOutputs: An object containing the processed inputs.
    """
    outputs = HfImageClassificationProcessor.classification(
        self,
        image=image,
    )

    return GenericOutputs(
        pixel_values=outputs.pixel_values,
    )
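
A minimal sketch; since the signature accepts `Union[Image, str]`, the image can be given as a path string directly (hypothetical filename).

```python
from unitorch.models.clip import ClipProcessor

processor = ClipProcessor()

# A path string is accepted in place of a PIL image.
outputs = processor.image_classification(image="cat.jpg")
# outputs.pixel_values holds the preprocessed image tensor.
```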

text_classification

text_classification(
    text: str, max_seq_length: Optional[int] = None
)

Performs text classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | The input text. | required |
| `max_seq_length` | `int` | The maximum sequence length for text inputs. Defaults to None. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `GenericOutputs` | An object containing the processed inputs. |

Source code in src/unitorch/models/clip/processing.py
def text_classification(
    self,
    text: str,
    max_seq_length: Optional[int] = None,
):
    """
    Performs text classification.

    Args:
        text (str): The input text.
        max_seq_length (int, optional): The maximum sequence length for text inputs. Defaults to None.

    Returns:
        GenericOutputs: An object containing the processed inputs.
    """
    outputs = HfTextClassificationProcessor.classification(
        self,
        text=text,
        max_seq_length=max_seq_length,
    )
    return GenericOutputs(
        input_ids=outputs.input_ids,
        attention_mask=outputs.attention_mask,
        position_ids=outputs.position_ids,
    )
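
A short sketch showing the per-call `max_seq_length` override (the value 32 is only illustrative); when it is omitted, the length set at construction time (128 by default) presumably applies.

```python
from unitorch.models.clip import ClipProcessor

processor = ClipProcessor()

# Override the maximum sequence length for this call only (illustrative value).
outputs = processor.text_classification(
    text="a photo of a dog wearing a hat",
    max_seq_length=32,
)
# outputs carries input_ids, attention_mask and position_ids.
```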

ClipForPretrain

Bases: GenericModel

Clip model for pretraining.

Initializes the ClipForPretrain model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_path` | `str` | Path to the model configuration file. | required |
| `projection_dim` | `int` | Dimension of the projected embeddings. Defaults to 512. | `512` |
| `freeze_base_model` | `bool` | Whether to freeze the base model parameters. Defaults to True. | `True` |
| `gradient_checkpointing` | `bool` | Whether to use gradient checkpointing. Defaults to False. | `False` |
| `use_all_gather` | `bool` | Whether to all-gather text/image embeddings across distributed processes. Defaults to True. | `True` |
Source code in src/unitorch/models/clip/modeling.py
def __init__(
    self,
    config_path: str,
    projection_dim: Optional[int] = 512,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
    use_all_gather: Optional[bool] = True,
):
    """
    Initializes the ClipForPretrain model.

    Args:
        config_path (str): Path to the model configuration file.
        projection_dim (int, optional): Dimension of the projected embeddings. Defaults to 512.
        freeze_base_model (bool, optional): Whether to freeze the base model parameters. Defaults to True.
        gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
        use_all_gather (bool, optional): Whether to use all-gather operation. Defaults to True.
    """
    super().__init__()

    config = CLIPConfig.from_json_file(config_path)
    text_config = config.text_config
    vision_config = config.vision_config
    text_config.gradient_checkpointing = gradient_checkpointing
    vision_config.gradient_checkpointing = gradient_checkpointing

    self.projection_dim = projection_dim
    self.use_all_gather = use_all_gather

    self.text_embed_dim = text_config.hidden_size
    self.vision_embed_dim = vision_config.hidden_size

    self.text_model = CLIPTextTransformer(text_config)
    self.vision_model = CLIPVisionTransformer(vision_config)

    self.visual_projection = nn.Linear(
        self.vision_embed_dim,
        self.projection_dim,
        bias=False,
    )
    self.text_projection = nn.Linear(
        self.text_embed_dim,
        self.projection_dim,
        bias=False,
    )
    self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)

    self.init_weights()

    if freeze_base_model:
        for p in self.text_model.parameters():
            p.requires_grad = False

        for p in self.vision_model.parameters():
            p.requires_grad = False

    self.text_model.encoder.gradient_checkpointing = gradient_checkpointing
    self.vision_model.encoder.gradient_checkpointing = gradient_checkpointing
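
A construction sketch, assuming a local CLIP `config.json` at a hypothetical path. The constructor builds the text and vision towers from that configuration and initializes their weights; pretrained weights, if desired, are expected to be loaded separately.

```python
from unitorch.models.clip import ClipForPretrain

model = ClipForPretrain(
    config_path="/path/to/clip/config.json",  # hypothetical path
    projection_dim=512,
    freeze_base_model=True,        # keep the text/vision towers frozen
    gradient_checkpointing=False,
    use_all_gather=True,           # gather embeddings across ranks when torch.distributed is initialized
)
```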

forward

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the Clip model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `Tensor` | Input text token IDs. | required |
| `pixel_values` | `Tensor` | Input image pixel values. | required |
| `attention_mask` | `Tensor` | Attention mask for the input. | required |
| `position_ids` | `Tensor` | Position IDs for the input tokens. | required |

Returns:

| Type | Description |
| --- | --- |
| `Tensor` | The contrastive loss computed by `_clip_loss` from the scaled text-image similarity logits. |

Source code in src/unitorch/models/clip/modeling.py
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the Clip model.

    Args:
        input_ids (torch.Tensor): Input text token IDs.
        pixel_values (torch.Tensor): Input image pixel values.
        attention_mask (torch.Tensor): Attention mask for the input.
        position_ids (torch.Tensor): Position IDs for the input tokens.

    Returns:
        (torch.Tensor): The contrastive loss computed by _clip_loss from the scaled text-image similarity logits.
    """
    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
    )

    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )

    image_embeds = vision_outputs[1]
    image_embeds = self.visual_projection(image_embeds)

    text_embeds = text_outputs[1]
    text_embeds = self.text_projection(text_embeds)

    # normalized features
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    logit_scale = self.logit_scale.exp()
    if self.use_all_gather and dist.is_initialized():
        text_embeds = self._all_gather(text_embeds)
        image_embeds = self._all_gather(image_embeds)
    logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
    return _clip_loss(logits_per_text)
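
Note that `forward` returns `_clip_loss(logits_per_text)`, i.e. a training loss rather than raw logits. The implementation of `_clip_loss` is not shown on this page; purely as an illustrative sketch (an assumption about its behaviour, not the library's verified code), a typical CLIP-style formulation is the symmetric cross-entropy over the text-to-image and image-to-text similarity matrices:

```python
import torch
import torch.nn.functional as F


def symmetric_clip_loss(logits_per_text: torch.Tensor) -> torch.Tensor:
    """Illustrative CLIP-style contrastive loss; _clip_loss may differ in detail."""
    # logits_per_text: (num_texts, num_images), where text i matches image i.
    targets = torch.arange(logits_per_text.size(0), device=logits_per_text.device)
    text_loss = F.cross_entropy(logits_per_text, targets)       # text -> image
    image_loss = F.cross_entropy(logits_per_text.t(), targets)  # image -> text
    return (text_loss + image_loss) / 2.0
```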

ClipForClassification

Bases: GenericModel

Clip model for classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_path` | `str` | Config file path to the Clip model. | required |
| `projection_dim` | `int` | Dimension of the image/text output embeddings. | `512` |
| `num_classes` | `int` | Number of classes for classification. | `1` |
| `freeze_base_model` | `bool` | Whether to freeze the base model. | `True` |
| `gradient_checkpointing` | `Optional[bool]` | Whether to enable gradient checkpointing. | `False` |
Source code in src/unitorch/models/clip/modeling.py
def __init__(
    self,
    config_path: str,
    projection_dim: Optional[int] = 512,
    num_classes: Optional[int] = 1,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Clip model for classification.

    Args:
        config_path (str): Config file path to Clip model.
        projection_dim (int): Dimension for image/text output embedding.
        num_classes (int): Number of classes for classification.
        freeze_base_model (bool): Whether to freeze the base model.
        gradient_checkpointing (Optional[bool]): Whether to enable gradient_checkpointing.
    """
    super().__init__()
    config = CLIPConfig.from_json_file(config_path)
    text_config = config.text_config
    vision_config = config.vision_config
    text_config.gradient_checkpointing = gradient_checkpointing
    vision_config.gradient_checkpointing = gradient_checkpointing

    self.projection_dim = projection_dim

    self.text_embed_dim = text_config.hidden_size
    self.vision_embed_dim = vision_config.hidden_size

    self.text_model = CLIPTextTransformer(text_config)
    self.vision_model = CLIPVisionTransformer(vision_config)

    self.visual_projection = nn.Linear(
        self.vision_embed_dim,
        self.projection_dim,
        bias=False,
    )
    self.text_projection = nn.Linear(
        self.text_embed_dim,
        self.projection_dim,
        bias=False,
    )

    self.classifier = nn.Linear(self.projection_dim * 2, num_classes)

    self.init_weights()

    if freeze_base_model:
        for p in self.text_model.parameters():
            p.requires_grad = False

        for p in self.vision_model.parameters():
            p.requires_grad = False

    self.text_model.encoder.gradient_checkpointing = gradient_checkpointing
    self.vision_model.encoder.gradient_checkpointing = gradient_checkpointing

forward

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the Clip model for classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `Tensor` | Token IDs of the input text. | required |
| `pixel_values` | `Tensor` | Pixel values of the input image. | required |
| `attention_mask` | `Tensor` | Attention mask for the tokens. | required |
| `position_ids` | `Tensor` | Position IDs. | required |

Returns:

| Type | Description |
| --- | --- |
| `Tensor` | Output logits from the classifier. |

Source code in src/unitorch/models/clip/modeling.py
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the Clip model for classification.

    Args:
        input_ids (torch.Tensor): Token IDs of the input text.
        pixel_values (torch.Tensor): Pixel values of the input image.
        attention_mask (torch.Tensor): Attention mask for the tokens.
        position_ids (torch.Tensor): Position IDs.

    Returns:
        (torch.Tensor): Output logits from the classifier.
    """
    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
    )

    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )

    image_embeds = vision_outputs[1]
    image_embeds = self.visual_projection(image_embeds)

    text_embeds = text_outputs[1]
    text_embeds = self.text_projection(text_embeds)

    return self.classifier(F.relu(torch.cat([image_embeds, text_embeds], axis=1)))
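
A shape-level smoke test with random inputs, assuming a ViT-B/32-style `config.json` at a hypothetical path (vocabulary size 49408, text context length 77, 224×224 images). In practice the tensors would come from `ClipProcessor.classification`, batched along the first dimension.

```python
import torch

from unitorch.models.clip import ClipForClassification

model = ClipForClassification(
    config_path="/path/to/clip/config.json",  # hypothetical path
    num_classes=2,
)
model.eval()

batch_size, seq_len = 2, 77
input_ids = torch.randint(0, 49408, (batch_size, seq_len))
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)
pixel_values = torch.randn(batch_size, 3, 224, 224)

with torch.no_grad():
    logits = model(
        input_ids=input_ids,
        pixel_values=pixel_values,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
print(logits.shape)  # torch.Size([2, 2])
```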

ClipForTextClassification

Bases: GenericModel

Initializes the Clip model for text classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_path` | `str` | The path to the CLIP configuration file. | required |
| `projection_dim` | `int` | The dimension of the projection layer. Defaults to 512. | `512` |
| `num_classes` | `int` | The number of classes for classification. Defaults to 1. | `1` |
| `freeze_base_model` | `bool` | Whether to freeze the base model parameters. Defaults to True. | `True` |
| `gradient_checkpointing` | `bool` | Whether to use gradient checkpointing. Defaults to False. | `False` |
Source code in src/unitorch/models/clip/modeling.py
def __init__(
    self,
    config_path: str,
    projection_dim: Optional[int] = 512,
    num_classes: Optional[int] = 1,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Initializes the Clip model for text classification.

    Args:
        config_path (str): The path to the CLIP configuration file.
        projection_dim (int, optional): The dimension of the projection layer. Defaults to 512.
        num_classes (int, optional): The number of classes for classification. Defaults to 1.
        freeze_base_model (bool, optional): Whether to freeze the base model parameters. Defaults to True.
        gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
    """
    super().__init__()
    config = CLIPConfig.from_json_file(config_path)
    text_config = config.text_config
    text_config.gradient_checkpointing = gradient_checkpointing

    self.projection_dim = projection_dim
    self.text_embed_dim = text_config.hidden_size

    self.text_model = CLIPTextTransformer(text_config)

    self.text_projection = nn.Linear(
        self.text_embed_dim,
        self.projection_dim,
        bias=False,
    )

    self.classifier = nn.Linear(self.projection_dim, num_classes)

    self.init_weights()

    if freeze_base_model:
        for p in self.text_model.parameters():
            p.requires_grad = False

    self.text_model.encoder.gradient_checkpointing = gradient_checkpointing

forward

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
)

Forward pass of the Clip model for text classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_ids` | `Tensor` | The input token IDs. | required |
| `attention_mask` | `Tensor` | The attention mask. | required |
| `position_ids` | `Tensor` | The position IDs. | required |

Returns:

| Type | Description |
| --- | --- |
| `Tensor` | The output logits. |

Source code in src/unitorch/models/clip/modeling.py
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
):
    """
    Forward pass of the Clip model for text classification.

    Args:
        input_ids (torch.Tensor): The input token IDs.
        attention_mask (torch.Tensor): The attention mask.
        position_ids (torch.Tensor): The position IDs.

    Returns:
        (torch.Tensor): The output logits.
    """
    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    text_embeds = text_outputs[1]
    text_embeds = self.text_projection(text_embeds)

    # normalized features
    # text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    return self.classifier(F.relu(text_embeds))
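
A similar shape-level sketch for the text-only classifier, under the same config assumptions (hypothetical path; vocabulary size 49408, context length 77).

```python
import torch

from unitorch.models.clip import ClipForTextClassification

model = ClipForTextClassification(
    config_path="/path/to/clip/config.json",  # hypothetical path
    num_classes=3,
)
model.eval()

input_ids = torch.randint(0, 49408, (4, 77))
attention_mask = torch.ones(4, 77, dtype=torch.long)
position_ids = torch.arange(77).unsqueeze(0).expand(4, -1)

with torch.no_grad():
    logits = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
print(logits.shape)  # torch.Size([4, 3])
```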

ClipForImageClassification

Bases: GenericModel

Initializes the Clip model for image classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config_path` | `str` | The path to the CLIP configuration file. | required |
| `projection_dim` | `int` | The dimension of the projection layer. Defaults to 512. | `512` |
| `num_classes` | `int` | The number of classes for classification. Defaults to 1. | `1` |
| `freeze_base_model` | `bool` | Whether to freeze the base model parameters. Defaults to True. | `True` |
| `gradient_checkpointing` | `bool` | Whether to use gradient checkpointing. Defaults to False. | `False` |
Source code in src/unitorch/models/clip/modeling.py
def __init__(
    self,
    config_path: str,
    projection_dim: Optional[int] = 512,
    num_classes: Optional[int] = 1,
    freeze_base_model: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Initializes the Clip model for image classification.

    Args:
        config_path (str): The path to the CLIP configuration file.
        projection_dim (int, optional): The dimension of the projection layer. Defaults to 512.
        num_classes (int, optional): The number of classes for classification. Defaults to 1.
        freeze_base_model (bool, optional): Whether to freeze the base model parameters. Defaults to True.
        gradient_checkpointing (bool, optional): Whether to use gradient checkpointing. Defaults to False.
    """
    super().__init__()
    config = CLIPConfig.from_json_file(config_path)
    vision_config = config.vision_config
    vision_config.gradient_checkpointing = gradient_checkpointing

    self.projection_dim = projection_dim
    self.vision_embed_dim = vision_config.hidden_size
    self.vision_model = CLIPVisionTransformer(vision_config)
    self.visual_projection = nn.Linear(
        self.vision_embed_dim,
        self.projection_dim,
        bias=False,
    )
    self.classifier = nn.Linear(self.projection_dim, num_classes)
    self.init_weights()

    if freeze_base_model:
        for p in self.vision_model.parameters():
            p.requires_grad = False

    self.vision_model.encoder.gradient_checkpointing = gradient_checkpointing

forward

forward(pixel_values: Tensor)

Forward pass of the Clip model for image classification.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pixel_values` | `Tensor` | The input pixel values. | required |

Returns:

| Type | Description |
| --- | --- |
| `Tensor` | The output logits. |

Source code in src/unitorch/models/clip/modeling.py
def forward(
    self,
    pixel_values: torch.Tensor,
):
    """
    Forward pass of the Clip model for image classification.

    Args:
        pixel_values (torch.Tensor): The input pixel values.

    Returns:
        (torch.Tensor): The output logits.
    """
    vision_outputs = self.vision_model(
        pixel_values=pixel_values,
    )

    image_embeds = vision_outputs[1]
    image_embeds = self.visual_projection(image_embeds)

    # normalized features
    # image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)

    return self.classifier(F.relu(image_embeds))
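
And a shape-level sketch for the image-only classifier, again assuming a ViT-B/32-style configuration with 224×224 inputs (hypothetical config path). In practice `pixel_values` would come from `ClipProcessor.image_classification`.

```python
import torch

from unitorch.models.clip import ClipForImageClassification

model = ClipForImageClassification(
    config_path="/path/to/clip/config.json",  # hypothetical path
    num_classes=10,
)
model.eval()

pixel_values = torch.randn(2, 3, 224, 224)  # random 224x224 RGB batch

with torch.no_grad():
    logits = model(pixel_values=pixel_values)
print(logits.shape)  # torch.Size([2, 10])
```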