Skip to content

unitorch.models.chinese_clip¤

ChineseClipProcessor¤

Bases: HfImageClassificationProcessor, HfTextClassificationProcessor

Multimodal processor for Chinese CLIP models.

Source code in src/unitorch/models/chinese_clip/processing.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(
    self,
    vocab_path: str,
    vision_config_path: str,
    max_seq_length: int = 128,
    position_start_id: int = 0,
) -> None:
    """Set up the image and text halves of the processor.

    Args:
        vocab_path: Vocabulary file passed to ``BertTokenizer``.
        vision_config_path: JSON file loaded through
            ``ChineseCLIPImageProcessor.from_json_file``.
        max_seq_length: Forwarded to ``HfTextClassificationProcessor``.
        position_start_id: Forwarded to ``HfTextClassificationProcessor``.
    """
    HfImageClassificationProcessor.__init__(
        self,
        vision_processor=ChineseCLIPImageProcessor.from_json_file(
            vision_config_path
        ),
    )

    tokenizer = BertTokenizer(vocab_file=vocab_path)
    # BertTokenizer ships [CLS]/[SEP] but defines no BOS/EOS tokens. Alias
    # BOS/EOS to the existing markers; assigning in the other direction would
    # replace the real [CLS]/[SEP] tokens with the unset BOS/EOS values.
    tokenizer.bos_token = tokenizer.cls_token
    tokenizer.eos_token = tokenizer.sep_token
    HfTextClassificationProcessor.__init__(
        self,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=1,
        position_start_id=position_start_id,
    )

text_classification ¤

text_classification(
    text: str, max_seq_length: Optional[int] = None
) -> GenericOutputs

Tokenise text for text classification.

Source code in src/unitorch/models/chinese_clip/processing.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def text_classification(
    self,
    text: str,
    max_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """Tokenise *text* for text classification.

    Args:
        text: Raw input text.
        max_seq_length: Optional override forwarded to the base text
            processor.

    Returns:
        ``GenericOutputs`` with ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``position_ids``.
    """
    outputs = HfTextClassificationProcessor.classification(
        self, text=text, max_seq_length=max_seq_length
    )
    return GenericOutputs(
        input_ids=outputs.input_ids,
        attention_mask=outputs.attention_mask,
        # The text models' forward() takes token_type_ids as a required
        # argument, so it has to be part of the processed sample.
        # NOTE(review): assumes the base processor output exposes
        # token_type_ids (it is configured with source/target type ids).
        token_type_ids=outputs.token_type_ids,
        position_ids=outputs.position_ids,
    )

image_classification ¤

image_classification(
    image: Union[Image, str],
) -> GenericOutputs

Preprocess image for image classification.

Source code in src/unitorch/models/chinese_clip/processing.py
62
63
64
65
66
67
68
def image_classification(self, image: Union[Image.Image, str]) -> GenericOutputs:
    """Run *image* through the inherited image preprocessing.

    Only the ``pixel_values`` field of the base processor's result is kept.
    """
    processed = HfImageClassificationProcessor.classification(self, image=image)
    return GenericOutputs(pixel_values=processed.pixel_values)

classification ¤

classification(
    text: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs

Preprocess a text-image pair for multimodal classification.

Source code in src/unitorch/models/chinese_clip/processing.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """Preprocess a text-image pair for multimodal classification.

    Args:
        text: Raw input text.
        image: Image object or a string accepted by ``image_classification``.
        max_seq_length: Optional override forwarded to the text processor.

    Returns:
        ``GenericOutputs`` with ``input_ids``, ``attention_mask``,
        ``token_type_ids``, ``position_ids`` and ``pixel_values``.
    """
    # Take the text fields straight from the base text processor so that
    # token_type_ids is available regardless of which fields the text-only
    # helper chooses to expose.
    # NOTE(review): assumes the base processor output carries token_type_ids.
    text_out = HfTextClassificationProcessor.classification(
        self, text=text, max_seq_length=max_seq_length
    )
    pixel_out = self.image_classification(image=image)
    return GenericOutputs(
        input_ids=text_out.input_ids,
        attention_mask=text_out.attention_mask,
        token_type_ids=text_out.token_type_ids,
        position_ids=text_out.position_ids,
        pixel_values=pixel_out.pixel_values,
    )

ChineseClipForPretrain¤

Bases: GenericModel

Chinese CLIP model for contrastive image-text pre-training.

Source code in src/unitorch/models/chinese_clip/modeling.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
    use_all_gather: bool = True,
) -> None:
    """Build both encoders, their projection layers and the logit scale.

    Args:
        config_path: JSON file loaded with ``ChineseCLIPConfig.from_json_file``.
        projection_dim: Output width of the text and visual projections.
        freeze_base_model: When true, both encoders are passed to ``_freeze``.
        gradient_checkpointing: Written to both sub-configs and to each
            encoder's ``gradient_checkpointing`` attribute.
        use_all_gather: Stored on the instance as ``self.use_all_gather``.
    """
    super().__init__()
    clip_config = ChineseCLIPConfig.from_json_file(config_path)
    text_config = clip_config.text_config
    vision_config = clip_config.vision_config
    for tower_config in (text_config, vision_config):
        tower_config.gradient_checkpointing = gradient_checkpointing

    self.use_all_gather = use_all_gather
    self.text_model = ChineseCLIPTextModel(text_config, add_pooling_layer=False)
    self.vision_model = ChineseCLIPVisionModel(vision_config)
    self.text_projection = nn.Linear(
        text_config.hidden_size, projection_dim, bias=False
    )
    self.visual_projection = nn.Linear(
        vision_config.hidden_size, projection_dim, bias=False
    )
    self.logit_scale = nn.Parameter(
        torch.ones([]) * clip_config.logit_scale_init_value
    )
    self.init_weights()

    if freeze_base_model:
        for tower in (self.text_model, self.vision_model):
            _freeze(tower)

    # Set the flag on the encoders as well as on the configs above.
    for tower in (self.text_model, self.vision_model):
        tower.encoder.gradient_checkpointing = gradient_checkpointing

use_all_gather instance-attribute ¤

use_all_gather = use_all_gather

text_model instance-attribute ¤

text_model = ChineseCLIPTextModel(
    text_config, add_pooling_layer=False
)

vision_model instance-attribute ¤

vision_model = ChineseCLIPVisionModel(vision_config)

text_projection instance-attribute ¤

text_projection = Linear(
    hidden_size, projection_dim, bias=False
)

visual_projection instance-attribute ¤

visual_projection = Linear(
    hidden_size, projection_dim, bias=False
)

logit_scale instance-attribute ¤

logit_scale = Parameter(ones([]) * logit_scale_init_value)

_all_gather ¤

_all_gather(x: Tensor) -> Tensor
Source code in src/unitorch/models/chinese_clip/modeling.py
60
61
62
def _all_gather(self, x: torch.Tensor) -> torch.Tensor:
    """Apply ``AllGather`` to *x* and merge the first two dimensions of the result."""
    gathered = AllGather.apply(x)
    trailing_dims = gathered.shape[2:]
    return gathered.view(-1, *trailing_dims)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
) -> Tensor
Source code in src/unitorch/models/chinese_clip/modeling.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    """Compute the value of ``_clip_loss`` on scaled text-to-image similarities.

    Both embeddings are projected, divided by their L2 norm along the last
    dimension and, when ``use_all_gather`` is set and ``torch.distributed``
    is initialised, passed through ``_all_gather`` before the similarity
    matrix is formed.
    """
    vision_outputs = self.vision_model(pixel_values=pixel_values)
    image_embeds = self.visual_projection(vision_outputs.pooler_output)

    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )
    # The hidden state at sequence position 0 serves as the text feature.
    first_token_state = text_outputs.last_hidden_state[:, 0, :]
    text_embeds = self.text_projection(first_token_state)

    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    gather_across_ranks = self.use_all_gather and dist.is_initialized()
    if gather_across_ranks:
        text_embeds = self._all_gather(text_embeds)
        image_embeds = self._all_gather(image_embeds)

    similarity = torch.matmul(text_embeds, image_embeds.t())
    logits_per_text = similarity * self.logit_scale.exp()
    return _clip_loss(logits_per_text)

ChineseClipForClassification¤

Bases: GenericModel

Chinese CLIP model for multimodal (image + text) classification.

Source code in src/unitorch/models/chinese_clip/modeling.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    num_classes: int = 1,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
) -> None:
    """Build both encoders, their projections and the classification head.

    Args:
        config_path: JSON file loaded with ``ChineseCLIPConfig.from_json_file``.
        projection_dim: Output width of each projection; the classifier
            takes ``projection_dim * 2`` input features.
        num_classes: Output width of the classifier.
        freeze_base_model: When true, both encoders are passed to ``_freeze``.
        gradient_checkpointing: Written to both sub-configs and to each
            encoder's ``gradient_checkpointing`` attribute.
    """
    super().__init__()
    clip_config = ChineseCLIPConfig.from_json_file(config_path)
    text_config = clip_config.text_config
    vision_config = clip_config.vision_config
    for tower_config in (text_config, vision_config):
        tower_config.gradient_checkpointing = gradient_checkpointing

    self.text_model = ChineseCLIPTextModel(text_config, add_pooling_layer=False)
    self.vision_model = ChineseCLIPVisionModel(vision_config)
    self.text_projection = nn.Linear(
        text_config.hidden_size, projection_dim, bias=False
    )
    self.visual_projection = nn.Linear(
        vision_config.hidden_size, projection_dim, bias=False
    )
    self.classifier = nn.Linear(projection_dim * 2, num_classes)
    self.init_weights()

    if freeze_base_model:
        for tower in (self.text_model, self.vision_model):
            _freeze(tower)

    # Set the flag on the encoders as well as on the configs above.
    for tower in (self.text_model, self.vision_model):
        tower.encoder.gradient_checkpointing = gradient_checkpointing

text_model instance-attribute ¤

text_model = ChineseCLIPTextModel(
    text_config, add_pooling_layer=False
)

vision_model instance-attribute ¤

vision_model = ChineseCLIPVisionModel(vision_config)

text_projection instance-attribute ¤

text_projection = Linear(
    hidden_size, projection_dim, bias=False
)

visual_projection instance-attribute ¤

visual_projection = Linear(
    hidden_size, projection_dim, bias=False
)

classifier instance-attribute ¤

classifier = Linear(projection_dim * 2, num_classes)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
) -> Tensor
Source code in src/unitorch/models/chinese_clip/modeling.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    """Classify an image-text pair from the concatenated projected features.

    The image and text embeddings are concatenated along dimension 1 (image
    first), passed through ReLU and fed to the classifier.
    """
    vision_outputs = self.vision_model(pixel_values=pixel_values)
    image_embeds = self.visual_projection(vision_outputs.pooler_output)

    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )
    # The hidden state at sequence position 0 serves as the text feature.
    first_token_state = text_outputs.last_hidden_state[:, 0, :]
    text_embeds = self.text_projection(first_token_state)

    fused = torch.cat([image_embeds, text_embeds], dim=1)
    return self.classifier(F.relu(fused))

ChineseClipForTextClassification¤

Bases: GenericModel

Chinese CLIP model for text-only classification.

Source code in src/unitorch/models/chinese_clip/modeling.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    num_classes: int = 1,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
) -> None:
    """Build the text encoder, its projection and the classification head.

    Args:
        config_path: JSON file loaded with ``ChineseCLIPConfig.from_json_file``.
        projection_dim: Output width of the text projection and input width
            of the classifier.
        num_classes: Output width of the classifier.
        freeze_base_model: When true, the text encoder is passed to ``_freeze``.
        gradient_checkpointing: Written to the text sub-config and to the
            encoder's ``gradient_checkpointing`` attribute.
    """
    super().__init__()
    clip_config = ChineseCLIPConfig.from_json_file(config_path)
    text_config = clip_config.text_config
    text_config.gradient_checkpointing = gradient_checkpointing

    self.text_model = ChineseCLIPTextModel(text_config, add_pooling_layer=False)
    self.text_projection = nn.Linear(
        text_config.hidden_size, projection_dim, bias=False
    )
    self.classifier = nn.Linear(projection_dim, num_classes)
    self.init_weights()

    if freeze_base_model:
        _freeze(self.text_model)

    # Set the flag on the encoder as well as on the config above.
    text_encoder = self.text_model.encoder
    text_encoder.gradient_checkpointing = gradient_checkpointing

text_model instance-attribute ¤

text_model = ChineseCLIPTextModel(
    text_config, add_pooling_layer=False
)

text_projection instance-attribute ¤

text_projection = Linear(
    hidden_size, projection_dim, bias=False
)

classifier instance-attribute ¤

classifier = Linear(projection_dim, num_classes)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
) -> Tensor
Source code in src/unitorch/models/chinese_clip/modeling.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    """Classify text from the projected hidden state at sequence position 0.

    The projected embedding goes through ReLU and then the classifier.
    """
    text_outputs = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )
    first_token_state = text_outputs.last_hidden_state[:, 0, :]
    text_embeds = self.text_projection(first_token_state)
    return self.classifier(F.relu(text_embeds))

ChineseClipForImageClassification¤

Bases: GenericModel

Chinese CLIP model for image-only classification.

Source code in src/unitorch/models/chinese_clip/modeling.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    num_classes: int = 1,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
) -> None:
    """Build the vision encoder, its projection and the classification head.

    Args:
        config_path: JSON file loaded with ``ChineseCLIPConfig.from_json_file``.
        projection_dim: Output width of the visual projection and input
            width of the classifier.
        num_classes: Output width of the classifier.
        freeze_base_model: When true, the vision encoder is passed to
            ``_freeze``.
        gradient_checkpointing: Written to the vision sub-config and to the
            encoder's ``gradient_checkpointing`` attribute.
    """
    super().__init__()
    clip_config = ChineseCLIPConfig.from_json_file(config_path)
    vision_config = clip_config.vision_config
    vision_config.gradient_checkpointing = gradient_checkpointing

    self.vision_model = ChineseCLIPVisionModel(vision_config)
    self.visual_projection = nn.Linear(
        vision_config.hidden_size, projection_dim, bias=False
    )
    self.classifier = nn.Linear(projection_dim, num_classes)
    self.init_weights()

    if freeze_base_model:
        _freeze(self.vision_model)

    # Set the flag on the encoder as well as on the config above.
    vision_encoder = self.vision_model.encoder
    vision_encoder.gradient_checkpointing = gradient_checkpointing

vision_model instance-attribute ¤

vision_model = ChineseCLIPVisionModel(vision_config)

visual_projection instance-attribute ¤

visual_projection = Linear(
    hidden_size, projection_dim, bias=False
)

classifier instance-attribute ¤

classifier = Linear(projection_dim, num_classes)

forward ¤

forward(pixel_values: Tensor) -> Tensor
Source code in src/unitorch/models/chinese_clip/modeling.py
228
229
230
231
232
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
    """Classify images from the projected ``pooler_output`` of the vision encoder.

    The projected embedding goes through ReLU and then the classifier.
    """
    vision_outputs = self.vision_model(pixel_values=pixel_values)
    image_embeds = self.visual_projection(vision_outputs.pooler_output)
    activated = F.relu(image_embeds)
    return self.classifier(activated)