unitorch.models.kolors¤

KolorsMPSProcessor¤

Bases: HfImageClassificationProcessor, HfTextClassificationProcessor

Initializes the KolorsMPSProcessor.

Parameters:

Name	Type	Description	Default
`vocab_path`	`str`	Path to the vocabulary file.	`None`
`merge_path`	`str`	Path to the merge file.	`None`
`vision_config_path`	`str`	Path to the vision processor configuration file.	`None`
`max_seq_length`	`int`	Maximum sequence length. Defaults to 77.	`77`
`position_start_id`	`int`	Starting position ID. Defaults to 0.	`0`

Source code in src/unitorch/models/kolors/processing.py

def __init__(
    self,
    vocab_path: Optional[str] = None,
    merge_path: Optional[str] = None,
    vision_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 77,
    position_start_id: Optional[int] = 0,
):
    """
    Set up the CLIP image processor and CLIP tokenizer behind this processor.

    Args:
        vocab_path (str, optional): Tokenizer vocabulary file. Only used when
            `merge_path` is supplied as well; otherwise the tokenizer is
            loaded from the default pretrained checkpoint.
        merge_path (str, optional): Tokenizer merges file. Only used when
            `vocab_path` is supplied as well.
        vision_config_path (str, optional): JSON configuration file for the
            image processor. When omitted, the image processor is loaded from
            the default pretrained checkpoint.
        max_seq_length (int, optional): Maximum sequence length. Defaults to 77.
        position_start_id (int, optional): Starting position ID. Defaults to 0.
    """
    pretrained_name = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"

    # Image side: an explicit JSON config wins over the pretrained checkpoint.
    if vision_config_path is None:
        vision_processor = CLIPImageProcessor.from_pretrained(pretrained_name)
    else:
        vision_processor = CLIPImageProcessor.from_json_file(vision_config_path)
    HfImageClassificationProcessor.__init__(self, vision_processor=vision_processor)

    # Text side: local files are used only when BOTH paths are provided.
    if vocab_path is None or merge_path is None:
        tokenizer = CLIPTokenizer.from_pretrained(pretrained_name)
    else:
        tokenizer = CLIPTokenizer(vocab_file=vocab_path, merges_file=merge_path)

    # Expose the BOS/EOS tokens under the cls/sep names before handing the
    # tokenizer to the text processor base class.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    HfTextClassificationProcessor.__init__(
        self,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        source_type_id=0,
        target_type_id=0,
        position_start_id=position_start_id,
    )

text_classification ¤

text_classification(
    text: str, max_seq_length: Optional[int] = None
)

Processes text for classification.

Parameters:

Name	Type	Description	Default
`text`	`str`	Input text.	required
`max_seq_length`	`int`	Maximum sequence length. Defaults to None.	`None`

Returns:

Name	Type	Description
	`GenericOutputs`	Processed text inputs.

Source code in src/unitorch/models/kolors/processing.py

def text_classification(
    self,
    text: str,
    max_seq_length: Optional[int] = None,
):
    """
    Tokenize a single text and return its model inputs.

    Args:
        text (str): Input text.
        max_seq_length (int, optional): Maximum sequence length, forwarded
            unchanged to the text processor base class. Defaults to None.

    Returns:
        GenericOutputs: Holds `input_ids`, `attention_mask` and
        `position_ids` taken from the base-class result.
    """
    encoded = HfTextClassificationProcessor.classification(
        self,
        text=text,
        max_seq_length=max_seq_length,
    )
    # Only these three fields of the base-class result are passed on.
    kept_fields = ("input_ids", "attention_mask", "position_ids")
    return GenericOutputs(**{name: getattr(encoded, name) for name in kept_fields})

image_classification ¤

image_classification(image: Union[Image, str])

Processes an image for classification.

Parameters:

Name	Type	Description	Default
`image`	`Image or str`	Input image or path.	required

Returns:

Name	Type	Description
	`GenericOutputs`	Processed image inputs.

Source code in src/unitorch/models/kolors/processing.py

def image_classification(
    self,
    image: Union[Image.Image, str],
):
    """
    Preprocess a single image and return its model inputs.

    Args:
        image (PIL.Image.Image or str): Input image or path.

    Returns:
        GenericOutputs: Holds `pixel_values` taken from the base-class result.
    """
    processed = HfImageClassificationProcessor.classification(self, image=image)
    pixel_values = processed.pixel_values
    return GenericOutputs(pixel_values=pixel_values)

classification ¤

classification(
    text: str,
    image: Union[Image, str],
    condition: str,
    max_seq_length: Optional[int] = None,
)

Processes text, image, and condition for multimodal classification.

Parameters:

Name	Type	Description	Default
`text`	`str`	Input text.	required
`image`	`Image or str`	Input image or path.	required
`condition`	`str`	Condition text.	required
`max_seq_length`	`int`	Maximum sequence length. Defaults to None.	`None`

Returns:

Name	Type	Description
	`GenericOutputs`	Processed text, image, and condition inputs.

Source code in src/unitorch/models/kolors/processing.py

def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    condition: str,
    max_seq_length: Optional[int] = None,
):
    """
    Build the combined inputs for a (text, image, condition) triple.

    The text and the condition are both run through `text_classification`
    with the same `max_seq_length`; the image goes through
    `image_classification`.

    Args:
        text (str): Input text.
        image (PIL.Image.Image or str): Input image or path.
        condition (str): Condition text.
        max_seq_length (int, optional): Maximum sequence length. Defaults to None.

    Returns:
        GenericOutputs: Holds `input_ids`, `attention_mask`, `position_ids`
        for the text, `pixel_values` for the image, and the same three text
        fields prefixed with `condition_` for the condition.
    """
    text_inputs = self.text_classification(
        text=text, max_seq_length=max_seq_length
    )
    image_inputs = self.image_classification(image=image)
    condition_inputs = self.text_classification(
        text=condition, max_seq_length=max_seq_length
    )

    text_fields = ("input_ids", "attention_mask", "position_ids")
    merged = {name: getattr(text_inputs, name) for name in text_fields}
    merged["pixel_values"] = image_inputs.pixel_values
    # Condition fields reuse the text field names under a "condition_" prefix.
    for name in text_fields:
        merged[f"condition_{name}"] = getattr(condition_inputs, name)
    return GenericOutputs(**merged)

KolorsMPSModel¤

Bases: GenericModel

Initializes the KolorsMPSModel.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the CLIP configuration file.	required

Source code in src/unitorch/models/kolors/modeling.py

def __init__(self, config_path: str):
    """
    Build the CLIP backbone and the cross-attention module.

    Args:
        config_path (str): Path to the CLIP configuration JSON file.
    """
    super().__init__()
    clip_config = CLIPConfig.from_json_file(config_path)
    self.config = clip_config
    self.model = CLIPModel(clip_config)
    # The cross-module sizes are fixed here rather than read from the config.
    self.cross_model = CrossModel(dim=1024, layer_num=4, heads=16)

config `instance-attribute` ¤

config = from_json_file(config_path)

model `instance-attribute` ¤

model = CLIPModel(config)

cross_model `instance-attribute` ¤

cross_model = CrossModel(dim=1024, layer_num=4, heads=16)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    position_ids: Tensor,
    pixel_values: Tensor,
    condition_input_ids: Tensor,
    condition_attention_mask: Tensor,
    condition_position_ids: Tensor,
    labels: Optional[Tensor] = None,
)

Forward pass of the KolorsMPSModel.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Text token IDs.	required
`attention_mask`	`Tensor`	Text attention mask.	required
`position_ids`	`Tensor`	Text position IDs.	required
`pixel_values`	`Tensor`	Image pixel values.	required
`condition_input_ids`	`Tensor`	Condition text token IDs.	required
`condition_attention_mask`	`Tensor`	Condition text attention mask.	required
`condition_position_ids`	`Tensor`	Condition text position IDs.	required
`labels`	`Tensor`	Labels (unused). Defaults to None.	`None`

Returns:

Type	Description
`Tensor`	Scaled similarity scores.

Source code in src/unitorch/models/kolors/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    condition_input_ids: torch.Tensor,
    condition_attention_mask: torch.Tensor,
    condition_position_ids: torch.Tensor,
    labels: Optional[torch.Tensor] = None,
):
    """
    Score an image against a text, with text tokens gated by a condition.

    Steps:
        1. Encode the text, the image and the condition with the CLIP towers
           and project them into the shared embedding space.
        2. Compare every text token with every condition token and keep, per
           text token, its best match. Tokens whose normalized match is not
           above 0.01 are masked out with ``-inf``.
        3. Run the cross model over (image features, text features, mask) and
           take index 0 along its second axis.
        4. Return the dot product of the L2-normalized pooled text embedding
           and the L2-normalized cross feature, scaled by ``exp(logit_scale)``.

    Args:
        input_ids (torch.Tensor): Text token IDs.
        attention_mask (torch.Tensor): Text attention mask.
        position_ids (torch.Tensor): Text position IDs.
        pixel_values (torch.Tensor): Image pixel values.
        condition_input_ids (torch.Tensor): Condition text token IDs.
        condition_attention_mask (torch.Tensor): Condition text attention mask.
        condition_position_ids (torch.Tensor): Condition text position IDs.
        labels (torch.Tensor, optional): Not read by this method. Defaults to None.

    Returns:
        torch.Tensor: Scaled similarity scores; the last axis is kept with size 1.
    """
    clip = self.model

    # Text: output [0] is projected per token, output [1] as the pooled vector.
    prompt_outputs = clip.text_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    token_embeds = clip.text_projection(prompt_outputs[0])
    pooled_embeds = clip.text_projection(prompt_outputs[1])

    # Image: output [0] is projected for every position, not only a pooled one.
    vision_outputs = clip.vision_model(pixel_values=pixel_values)
    patch_embeds = clip.visual_projection(vision_outputs[0])

    # Condition text goes through the same text tower and projection.
    cond_outputs = clip.text_model(
        input_ids=condition_input_ids,
        attention_mask=condition_attention_mask,
        position_ids=condition_position_ids,
    )
    cond_embeds = clip.text_projection(cond_outputs[0])

    # relevance[b, j, i] = <text token i, condition token j>; reduce over j so
    # each text token keeps its best-matching condition token.
    relevance = torch.einsum(
        "b i d, b j d -> b j i", token_embeds, cond_embeds
    )
    relevance = relevance.max(dim=1, keepdim=True).values
    # NOTE(review): this max is taken over the whole tensor, batch included,
    # so the mask of one sample depends on the other samples in the batch.
    relevance = relevance / relevance.max()
    attn_mask = torch.where(relevance > 0.01, 0, float("-inf"))
    # Copy the per-text-token mask once for every image position.
    attn_mask = attn_mask.repeat(1, patch_embeds.shape[1], 1)

    fused = self.cross_model(patch_embeds, token_embeds, attn_mask)[:, 0]

    text_unit = pooled_embeds / pooled_embeds.norm(dim=-1, keepdim=True)
    fused_unit = fused / fused.norm(dim=-1, keepdim=True)

    scores = (text_unit * fused_unit).sum(dim=-1, keepdim=True)
    return clip.logit_scale.exp() * scores

unitorch.models.kolors¤

KolorsMPSProcessor¤

text_classification ¤

image_classification ¤

classification ¤

KolorsMPSModel¤

config instance-attribute ¤

model instance-attribute ¤

cross_model instance-attribute ¤

forward ¤

config `instance-attribute` ¤

model `instance-attribute` ¤

cross_model `instance-attribute` ¤