unitorch.models.clip¤

ClipProcessor¤

Bases: HfImageClassificationProcessor, HfTextClassificationProcessor

Multimodal processor for CLIP models.

Source code in src/unitorch/models/clip/processing.py

def __init__(
    self,
    vocab_path: Optional[str] = None,
    merge_path: Optional[str] = None,
    vision_config_path: Optional[str] = None,
    max_seq_length: int = 128,
    position_start_id: int = 0,
) -> None:
    """Set up the image and text halves of the CLIP processor.

    Args:
        vocab_path: Tokenizer vocabulary file. Only used when *merge_path*
            is given as well.
        merge_path: Tokenizer merges file. Only used when *vocab_path* is
            given as well.
        vision_config_path: JSON file describing the image processor. When
            omitted, the ``openai/clip-vit-base-patch32`` image processor
            is loaded instead.
        max_seq_length: Forwarded to the text processor base class.
        position_start_id: Forwarded to the text processor base class.
    """
    # --- image side ---------------------------------------------------
    if vision_config_path is None:
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32"
        )
    else:
        image_processor = CLIPImageProcessor.from_json_file(vision_config_path)
    HfImageClassificationProcessor.__init__(self, vision_processor=image_processor)

    # --- text side ----------------------------------------------------
    # Both files are required to build a tokenizer locally; if either one
    # is missing the pretrained tokenizer is used and the other path is
    # ignored.
    if vocab_path is None or merge_path is None:
        clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    else:
        clip_tokenizer = CLIPTokenizer(vocab_file=vocab_path, merges_file=merge_path)

    # Alias BOS/EOS as CLS/SEP on the tokenizer before handing it to the
    # text base class (presumably the base class reads cls/sep — confirm).
    clip_tokenizer.cls_token = clip_tokenizer.bos_token
    clip_tokenizer.sep_token = clip_tokenizer.eos_token

    HfTextClassificationProcessor.__init__(
        self,
        tokenizer=clip_tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=0,
        position_start_id=position_start_id,
    )

text_classification ¤

text_classification(
    text: str, max_seq_length: Optional[int] = None
) -> GenericOutputs

Tokenise text for text classification.

Source code in src/unitorch/models/clip/processing.py

def text_classification(
    self,
    text: str,
    max_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """Tokenise *text* and return only the fields used for text classification.

    Args:
        text: Input string to tokenise.
        max_seq_length: Optional length override, passed through unchanged
            to the text processor base class.

    Returns:
        ``GenericOutputs`` holding ``input_ids``, ``attention_mask`` and
        ``position_ids`` taken from the base-class result.
    """
    encoded = HfTextClassificationProcessor.classification(
        self,
        text=text,
        max_seq_length=max_seq_length,
    )
    # Re-wrap so that only these three fields are exposed to the caller.
    kept_fields = ("input_ids", "attention_mask", "position_ids")
    return GenericOutputs(**{name: getattr(encoded, name) for name in kept_fields})

image_classification ¤

image_classification(
    image: Union[Image, str],
) -> GenericOutputs

Preprocess image for image classification.

Source code in src/unitorch/models/clip/processing.py

def image_classification(self, image: Union[Image.Image, str]) -> GenericOutputs:
    """Run the image processor on *image* and return its pixel values.

    Args:
        image: The image to preprocess; passed unchanged to the image
            processor base class.

    Returns:
        ``GenericOutputs`` holding a single ``pixel_values`` field.
    """
    processed = HfImageClassificationProcessor.classification(self, image=image)
    return GenericOutputs(pixel_values=processed.pixel_values)

classification ¤

classification(
    text: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs

Preprocess a text-image pair for multimodal classification.

Source code in src/unitorch/models/clip/processing.py

def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """Preprocess one text-image pair for multimodal classification.

    Args:
        text: Input string, handled by ``text_classification``.
        image: Input image, handled by ``image_classification``.
        max_seq_length: Optional length override for the text side.

    Returns:
        ``GenericOutputs`` combining ``input_ids``, ``attention_mask`` and
        ``position_ids`` from the text side with ``pixel_values`` from the
        image side.
    """
    # Text is processed before the image, as in the per-modality helpers.
    text_part = self.text_classification(text=text, max_seq_length=max_seq_length)
    image_part = self.image_classification(image=image)

    merged = {
        name: getattr(text_part, name)
        for name in ("input_ids", "attention_mask", "position_ids")
    }
    merged["pixel_values"] = image_part.pixel_values
    return GenericOutputs(**merged)

ClipForPretrain¤

Bases: GenericModel

CLIP model for contrastive image-text pre-training.

Source code in src/unitorch/models/clip/modeling.py

def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
    use_all_gather: bool = True,
) -> None:
    """Build the text/vision towers, projection heads and logit scale.

    Args:
        config_path: JSON file from which the ``CLIPConfig`` is loaded.
        projection_dim: Output width of both projection layers.
        freeze_base_model: When true, ``_freeze`` is applied to the text
            and vision models (presumably disabling their gradients —
            confirm against ``_freeze``).
        gradient_checkpointing: Value written to both sub-configs and to
            both encoders.
        use_all_gather: Stored on the instance; read by ``forward``.
    """
    super().__init__()

    clip_config = CLIPConfig.from_json_file(config_path)
    text_cfg, vision_cfg = clip_config.text_config, clip_config.vision_config
    for sub_cfg in (text_cfg, vision_cfg):
        sub_cfg.gradient_checkpointing = gradient_checkpointing

    self.use_all_gather = use_all_gather

    # Sub-modules are registered in the same order as before so parameter
    # ordering is unaffected.
    self.text_model = CLIPTextModel(text_cfg)
    self.vision_model = CLIPVisionModel(vision_cfg)
    self.text_projection = nn.Linear(text_cfg.hidden_size, projection_dim, bias=False)
    self.visual_projection = nn.Linear(
        vision_cfg.hidden_size, projection_dim, bias=False
    )
    self.logit_scale = nn.Parameter(
        torch.ones([]) * clip_config.logit_scale_init_value
    )
    self.init_weights()

    towers = (self.text_model, self.vision_model)
    if freeze_base_model:
        for tower in towers:
            _freeze(tower)

    # NOTE(review): relies on each tower exposing `.encoder` directly —
    # confirm against the classes actually imported under these names.
    for tower in towers:
        tower.encoder.gradient_checkpointing = gradient_checkpointing

use_all_gather `instance-attribute` ¤

use_all_gather = use_all_gather

text_model `instance-attribute` ¤

text_model = CLIPTextModel(text_config)

vision_model `instance-attribute` ¤

vision_model = CLIPVisionModel(vision_config)

text_projection `instance-attribute` ¤

text_projection = Linear(
    hidden_size, projection_dim, bias=False
)

visual_projection `instance-attribute` ¤

visual_projection = Linear(
    hidden_size, projection_dim, bias=False
)

logit_scale `instance-attribute` ¤

logit_scale = Parameter(ones([]) * logit_scale_init_value)

_all_gather ¤

_all_gather(x: Tensor) -> Tensor

Source code in src/unitorch/models/clip/modeling.py

def _all_gather(self, x: torch.Tensor) -> torch.Tensor:
    """Gather *x* through ``AllGather`` and merge its first two dimensions.

    Presumably ``AllGather.apply`` stacks the per-process tensors along a
    new leading dimension (world, batch, ...) — confirm against
    ``AllGather``. The result is viewed with dims 0 and 1 collapsed into
    one and all remaining dimensions kept.
    """
    gathered = AllGather.apply(x)
    trailing_dims = gathered.shape[2:]
    return gathered.view(-1, *trailing_dims)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
) -> Tensor

Source code in src/unitorch/models/clip/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    """Compute the pre-training loss for a batch of text-image pairs.

    Both modalities are encoded, projected and L2-normalised along the
    last dimension. The text-to-image similarity matrix, scaled by
    ``exp(logit_scale)``, is then passed to ``_clip_loss``.

    Args:
        input_ids: Token ids fed to the text model.
        pixel_values: Image tensor fed to the vision model.
        attention_mask: Attention mask fed to the text model.
        position_ids: Position ids fed to the text model.

    Returns:
        The value returned by ``_clip_loss`` for the scaled text-to-image
        logits.
    """
    image_embeds = self.visual_projection(
        self.vision_model(pixel_values=pixel_values).pooler_output
    )
    text_embeds = self.text_projection(
        self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        ).pooler_output
    )

    # Unit-normalise so the matmul below yields cosine similarities.
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    # `torch.distributed` exposes no API other than `is_available()` on
    # builds without distributed support, so availability must be checked
    # before `is_initialized()` to avoid an AttributeError there.
    if self.use_all_gather and dist.is_available() and dist.is_initialized():
        text_embeds = self._all_gather(text_embeds)
        image_embeds = self._all_gather(image_embeds)

    logits_per_text = (
        torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale.exp()
    )
    return _clip_loss(logits_per_text)

ClipForClassification¤

Bases: GenericModel

CLIP model for multimodal (image + text) classification.

Source code in src/unitorch/models/clip/modeling.py

def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    num_classes: int = 1,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
) -> None:
    """Build the text/vision towers, projection heads and classifier.

    Args:
        config_path: JSON file from which the ``CLIPConfig`` is loaded.
        projection_dim: Output width of each projection layer; the
            classifier input is twice this width.
        num_classes: Output width of the classifier.
        freeze_base_model: When true, ``_freeze`` is applied to the text
            and vision models (presumably disabling their gradients —
            confirm against ``_freeze``).
        gradient_checkpointing: Value written to both sub-configs and to
            both encoders.
    """
    super().__init__()

    clip_config = CLIPConfig.from_json_file(config_path)
    text_cfg, vision_cfg = clip_config.text_config, clip_config.vision_config
    for sub_cfg in (text_cfg, vision_cfg):
        sub_cfg.gradient_checkpointing = gradient_checkpointing

    # Sub-modules are registered in the same order as before so parameter
    # ordering is unaffected.
    self.text_model = CLIPTextModel(text_cfg)
    self.vision_model = CLIPVisionModel(vision_cfg)
    self.text_projection = nn.Linear(text_cfg.hidden_size, projection_dim, bias=False)
    self.visual_projection = nn.Linear(
        vision_cfg.hidden_size, projection_dim, bias=False
    )
    # The head consumes both projected embeddings side by side.
    self.classifier = nn.Linear(projection_dim * 2, num_classes)
    self.init_weights()

    towers = (self.text_model, self.vision_model)
    if freeze_base_model:
        for tower in towers:
            _freeze(tower)

    # NOTE(review): relies on each tower exposing `.encoder` directly —
    # confirm against the classes actually imported under these names.
    for tower in towers:
        tower.encoder.gradient_checkpointing = gradient_checkpointing

text_model `instance-attribute` ¤

text_model = CLIPTextModel(text_config)

vision_model `instance-attribute` ¤

vision_model = CLIPVisionModel(vision_config)

text_projection `instance-attribute` ¤

text_projection = Linear(
    hidden_size, projection_dim, bias=False
)

visual_projection `instance-attribute` ¤

visual_projection = Linear(
    hidden_size, projection_dim, bias=False
)

classifier `instance-attribute` ¤

classifier = Linear(projection_dim * 2, num_classes)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
) -> Tensor

Source code in src/unitorch/models/clip/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    """Classify a batch of text-image pairs.

    Each modality is encoded and projected; the image and text embeddings
    are concatenated along dim 1 (image first), passed through ReLU and
    then through the classifier.

    Args:
        input_ids: Token ids fed to the text model.
        pixel_values: Image tensor fed to the vision model.
        attention_mask: Attention mask fed to the text model.
        position_ids: Position ids fed to the text model.

    Returns:
        The classifier output for the fused embeddings.
    """
    # Vision branch runs first, then the text branch.
    vision_pooled = self.vision_model(pixel_values=pixel_values).pooler_output
    image_embeds = self.visual_projection(vision_pooled)

    text_pooled = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    ).pooler_output
    text_embeds = self.text_projection(text_pooled)

    fused = torch.cat([image_embeds, text_embeds], dim=1)
    return self.classifier(F.relu(fused))

ClipForTextClassification¤

Bases: GenericModel

CLIP model for text-only classification.

Source code in src/unitorch/models/clip/modeling.py

def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    num_classes: int = 1,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
) -> None:
    """Build the text tower, its projection head and the classifier.

    Args:
        config_path: JSON file from which the ``CLIPConfig`` is loaded;
            only its ``text_config`` is used here.
        projection_dim: Output width of the projection layer and input
            width of the classifier.
        num_classes: Output width of the classifier.
        freeze_base_model: When true, ``_freeze`` is applied to the text
            model (presumably disabling its gradients — confirm against
            ``_freeze``).
        gradient_checkpointing: Value written to the text sub-config and
            to the text encoder.
    """
    super().__init__()

    text_cfg = CLIPConfig.from_json_file(config_path).text_config
    text_cfg.gradient_checkpointing = gradient_checkpointing

    self.text_model = CLIPTextModel(text_cfg)
    self.text_projection = nn.Linear(text_cfg.hidden_size, projection_dim, bias=False)
    self.classifier = nn.Linear(projection_dim, num_classes)
    self.init_weights()

    if freeze_base_model:
        _freeze(self.text_model)

    # NOTE(review): relies on the tower exposing `.encoder` directly —
    # confirm against the class actually imported as CLIPTextModel.
    text_encoder = self.text_model.encoder
    text_encoder.gradient_checkpointing = gradient_checkpointing

text_model `instance-attribute` ¤

text_model = CLIPTextModel(text_config)

text_projection `instance-attribute` ¤

text_projection = Linear(
    hidden_size, projection_dim, bias=False
)

classifier `instance-attribute` ¤

classifier = Linear(projection_dim, num_classes)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
) -> Tensor

Source code in src/unitorch/models/clip/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
) -> torch.Tensor:
    """Classify a batch of texts.

    The text model's pooled output is projected, passed through ReLU and
    then through the classifier.

    Args:
        input_ids: Token ids fed to the text model.
        attention_mask: Attention mask fed to the text model.
        position_ids: Position ids fed to the text model.

    Returns:
        The classifier output for the projected text embeddings.
    """
    encoder_out = self.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    projected = self.text_projection(encoder_out.pooler_output)
    activated = F.relu(projected)
    return self.classifier(activated)

ClipForImageClassification¤

Bases: GenericModel

CLIP model for image-only classification.

Source code in src/unitorch/models/clip/modeling.py

def __init__(
    self,
    config_path: str,
    projection_dim: int = 512,
    num_classes: int = 1,
    freeze_base_model: bool = True,
    gradient_checkpointing: bool = False,
) -> None:
    """Build the vision tower, its projection head and the classifier.

    Args:
        config_path: JSON file from which the ``CLIPConfig`` is loaded;
            only its ``vision_config`` is used here.
        projection_dim: Output width of the projection layer and input
            width of the classifier.
        num_classes: Output width of the classifier.
        freeze_base_model: When true, ``_freeze`` is applied to the vision
            model (presumably disabling its gradients — confirm against
            ``_freeze``).
        gradient_checkpointing: Value written to the vision sub-config and
            to the vision encoder.
    """
    super().__init__()

    vision_cfg = CLIPConfig.from_json_file(config_path).vision_config
    vision_cfg.gradient_checkpointing = gradient_checkpointing

    self.vision_model = CLIPVisionModel(vision_cfg)
    self.visual_projection = nn.Linear(
        vision_cfg.hidden_size, projection_dim, bias=False
    )
    self.classifier = nn.Linear(projection_dim, num_classes)
    self.init_weights()

    if freeze_base_model:
        _freeze(self.vision_model)

    # NOTE(review): relies on the tower exposing `.encoder` directly —
    # confirm against the class actually imported as CLIPVisionModel.
    vision_encoder = self.vision_model.encoder
    vision_encoder.gradient_checkpointing = gradient_checkpointing

vision_model `instance-attribute` ¤

vision_model = CLIPVisionModel(vision_config)

visual_projection `instance-attribute` ¤

visual_projection = Linear(
    hidden_size, projection_dim, bias=False
)

classifier `instance-attribute` ¤

classifier = Linear(projection_dim, num_classes)

forward ¤

forward(pixel_values: Tensor) -> Tensor

Source code in src/unitorch/models/clip/modeling.py

def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
    """Classify a batch of images.

    The vision model's pooled output is projected, passed through ReLU and
    then through the classifier.

    Args:
        pixel_values: Image tensor fed to the vision model.

    Returns:
        The classifier output for the projected image embeddings.
    """
    pooled = self.vision_model(pixel_values=pixel_values).pooler_output
    projected = self.visual_projection(pooled)
    activated = F.relu(projected)
    return self.classifier(activated)

unitorch.models.clip¤

ClipProcessor¤

text_classification ¤

image_classification ¤

classification ¤

ClipForPretrain¤

use_all_gather instance-attribute ¤

text_model instance-attribute ¤

vision_model instance-attribute ¤

text_projection instance-attribute ¤

visual_projection instance-attribute ¤

logit_scale instance-attribute ¤

_all_gather ¤

forward ¤

ClipForClassification¤

text_model instance-attribute ¤

vision_model instance-attribute ¤

text_projection instance-attribute ¤

visual_projection instance-attribute ¤

classifier instance-attribute ¤

forward ¤

ClipForTextClassification¤

text_model instance-attribute ¤

text_projection instance-attribute ¤

classifier instance-attribute ¤

forward ¤

ClipForImageClassification¤

vision_model instance-attribute ¤

visual_projection instance-attribute ¤

classifier instance-attribute ¤

forward ¤

use_all_gather `instance-attribute` ¤

text_model `instance-attribute` ¤

vision_model `instance-attribute` ¤

text_projection `instance-attribute` ¤

visual_projection `instance-attribute` ¤

logit_scale `instance-attribute` ¤

text_model `instance-attribute` ¤

vision_model `instance-attribute` ¤

text_projection `instance-attribute` ¤

visual_projection `instance-attribute` ¤

classifier `instance-attribute` ¤

text_model `instance-attribute` ¤

text_projection `instance-attribute` ¤

classifier `instance-attribute` ¤

vision_model `instance-attribute` ¤

visual_projection `instance-attribute` ¤

classifier `instance-attribute` ¤