unitorch.models.llava¤

LlavaMistralClipProcessor¤

Bases: HfTextClassificationProcessor, HfTextGenerationProcessor, HfImageClassificationProcessor

Source code in src/unitorch/models/llava/processing.py

def __init__(
    self,
    tokenizer_file: str,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    chat_template: Optional[str] = None,
    vision_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 128,
    max_gen_seq_length: Optional[int] = 48,
):
    """
    Build the Llama fast tokenizer and the CLIP image processor, then initialise the parent processors.

    Args:
        tokenizer_file (str): Path passed to `LlamaTokenizerFast` as `tokenizer_file`.
        tokenizer_config (str, optional): Path to a JSON file with extra tokenizer keyword arguments.
            Its `added_tokens_decoder` entry, if any, is handled separately.
        special_tokens_map (str, optional): Path to a JSON file mapping special-token names to token specs.
        chat_template (str, optional): Path to a JSON file whose `chat_template` entry is set on the tokenizer.
        vision_config_path (str, optional): Path to a CLIP image processor JSON config. When None, the
            `openai/clip-vit-base-patch32` processor is loaded with `from_pretrained`.
        max_seq_length (int, optional): Forwarded to both text parent processors. Defaults to 128.
        max_gen_seq_length (int, optional): Forwarded to the text generation parent processor. Defaults to 48.
    """
    config_kwargs = {}
    if tokenizer_config:
        config_kwargs = read_json_file(tokenizer_config)
    token_map = {}
    if special_tokens_map:
        token_map = read_json_file(special_tokens_map)

    # The added-token table is not a constructor argument; it is applied after construction.
    extra_decoder = config_kwargs.pop("added_tokens_decoder", {})

    # Entries serialised as AddedToken dicts are converted back before being passed on.
    tokenizer_kwargs = {}
    for key, value in config_kwargs.items():
        if isinstance(value, dict) and value.get("__type") == "AddedToken":
            value = get_added_token(value)
        tokenizer_kwargs[key] = value

    tokenizer = LlamaTokenizerFast(tokenizer_file=tokenizer_file, **tokenizer_kwargs)

    for token_id, token_spec in extra_decoder.items():
        content = token_spec["content"]
        tokenizer.added_tokens_decoder[token_id] = get_added_token(token_spec)
        tokenizer.added_tokens_encoder[content] = token_id

    tokenizer.add_special_tokens(
        {name: get_added_token(spec) for name, spec in token_map.items()}
    )

    if chat_template:
        tokenizer.chat_template = read_json_file(chat_template)["chat_template"]

    # cls/sep/pad are aliased to bos/eos/unk, first as tokens and then as ids.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.cls_token_id = tokenizer.bos_token_id
    tokenizer.sep_token_id = tokenizer.eos_token_id
    tokenizer.pad_token_id = tokenizer.unk_token_id

    HfTextClassificationProcessor.__init__(
        self, tokenizer=tokenizer, max_seq_length=max_seq_length
    )
    HfTextGenerationProcessor.__init__(
        self,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

    if vision_config_path is None:
        vision_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-base-patch32"
        )
    else:
        vision_processor = CLIPImageProcessor.from_json_file(vision_config_path)
    HfImageClassificationProcessor.__init__(self, vision_processor=vision_processor)

chat_template ¤

chat_template(messages: List[Dict[str, Any]])

Source code in src/unitorch/models/llava/processing.py

def chat_template(
    self,
    messages: List[Dict[str, Any]],
):
    """
    Render chat messages through the tokenizer's chat template.

    Args:
        messages (List[Dict[str, Any]]): Messages passed unchanged to `apply_chat_template`.

    Returns:
        The result of `tokenizer.apply_chat_template` called with `tokenize=False` and
        `add_generation_prompt=True`.
    """
    return self.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

image_classification ¤

image_classification(image: Union[Image, str])

Source code in src/unitorch/models/llava/processing.py

def image_classification(
    self,
    image: Union[Image.Image, str],
):
    """
    Preprocess an image with the parent image-classification processor.

    Args:
        image (Union[Image.Image, str]): Image forwarded to `HfImageClassificationProcessor.classification`.

    Returns:
        GenericOutputs: Holds only the `pixel_values` produced by the parent processor.
    """
    processed = HfImageClassificationProcessor.classification(self, image=image)
    return GenericOutputs(pixel_values=processed.pixel_values)

classification ¤

classification(
    text: str,
    image: Union[Image, str],
    text_pair: Optional[str] = None,
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/llava/processing.py

def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    text_pair: Optional[str] = None,
    max_seq_length: Optional[int] = None,
):
    """
    Tokenize text (optionally with a second segment), left-pad it and preprocess the image.

    Args:
        text (str): Prompt; must contain "<image>" and must not end with it.
        image (Union[Image.Image, str]): Image passed to `image_classification`.
        text_pair (str, optional): Second segment appended after `text`; the pair is truncated jointly.
        max_seq_length (int, optional): Target length; falls back to `self.max_seq_length`.

    Returns:
        GenericOutputs: `input_ids` and `attention_mask` of length `max_seq_length`, plus `pixel_values`.
    """
    assert "<image>" in text and not text.endswith("<image>")
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)

    tokens = self.tokenizer.tokenize(str(text))
    if text_pair is not None:
        pair_tokens = self.tokenizer.tokenize(str(text_pair))
        truncate_sequence_pair(tokens, pair_tokens, max_seq_length)
        tokens = tokens + pair_tokens
    else:
        tokens = tokens[:max_seq_length]
    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

    # Padding goes on the left so the real tokens end the sequence.
    pad_count = len([0] * (max_seq_length - len(input_ids)))
    attention_mask = [0] * pad_count + [1] * len(input_ids)
    input_ids = [self.pad_token_id] * pad_count + input_ids

    pixel_outputs = self.image_classification(image=image)

    assert len(input_ids) == max_seq_length
    assert len(attention_mask) == max_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        pixel_values=pixel_outputs.pixel_values,
    )

generation_inputs ¤

generation_inputs(
    text: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/llava/processing.py

def generation_inputs(
    self,
    text: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """
    Build left-padded prompt inputs and image features for generation.

    Args:
        text (str): Prompt; must contain "<image>" and must not end with it.
        image (Union[Image.Image, str]): Image passed to `image_classification`.
        max_seq_length (int, optional): Target length; falls back to `self.max_seq_length`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask` and `pixel_values`.
    """
    assert "<image>" in text and not text.endswith("<image>")
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)

    # BOS followed by the trailing tokens of the prompt (the slice keeps the end of the text).
    tokens = [self.bos_token]
    tokens.extend(self.tokenizer.tokenize(str(text))[1 - max_seq_length :])

    left_pad = [self.pad_token] * (max_seq_length - len(tokens))
    attention_mask = [0] * len(left_pad) + [1] * len(tokens)
    input_ids = self.tokenizer.convert_tokens_to_ids(left_pad + tokens)

    pixel_outputs = self.image_classification(image=image)

    assert len(input_ids) == max_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        pixel_values=pixel_outputs.pixel_values,
    )

generation_labels ¤

generation_labels(
    text: str, max_gen_seq_length: Optional[int] = None
)

Source code in src/unitorch/models/llava/processing.py

def generation_labels(
    self,
    text: str,
    max_gen_seq_length: Optional[int] = None,
):
    """
    Tokenize target text into right-padded label ids ending with EOS.

    Args:
        text (str): Target text.
        max_gen_seq_length (int, optional): Target length; falls back to `self.max_gen_seq_length`.

    Returns:
        GenericOutputs: `input_ids` and `attention_mask`, both of length `max_gen_seq_length`.
    """
    max_gen_seq_length = pop_value(
        max_gen_seq_length,
        self.max_gen_seq_length,
    )
    # One slot is reserved for the EOS token appended after truncation.
    tokens = self.tokenizer.tokenize(str(text))[: max_gen_seq_length - 1] + [
        self.eos_token
    ]
    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(input_ids)

    # Pad on the right with pad ids; padded positions are masked out.
    # (The original also built an unused list of pad tokens here; it was dead code.)
    pad_length = max_gen_seq_length - len(input_ids)
    input_ids += [self.pad_token_id] * pad_length
    attention_mask += [0] * pad_length

    assert len(input_ids) == max_gen_seq_length
    assert len(attention_mask) == max_gen_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
    )

generation ¤

generation(
    text: str,
    image: Union[Image, str],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/llava/processing.py

def generation(
    self,
    text: str,
    image: Union[Image.Image, str],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
):
    """
    Build joint prompt+target inputs, shifted labels and image features for training.

    Args:
        text (str): Prompt; must contain "<image>" and must not end with it.
        image (Union[Image.Image, str]): Image passed to `image_classification`.
        text_pair (str): Target text.
        max_seq_length (int, optional): Prompt length; falls back to `self.max_seq_length`.
        max_gen_seq_length (int, optional): Target length; falls back to `self.max_gen_seq_length`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `pixel_values`, `input_ids_label`
        and `attention_mask_label`.
    """
    assert "<image>" in text and not text.endswith("<image>")
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)
    max_gen_seq_length = pop_value(max_gen_seq_length, self.max_gen_seq_length)

    prompt_tokens = [self.bos_token] + self.tokenizer.tokenize(str(text))[
        1 - max_seq_length :
    ]
    target_tokens = self.tokenizer.tokenize(str(text_pair))[
        : max_gen_seq_length - 1
    ] + [self.eos_token]

    # The prompt is padded on the left and the target on the right.
    left_pad = [self.pad_token] * (max_seq_length - len(prompt_tokens))
    right_pad = [self.pad_token] * (max_gen_seq_length - len(target_tokens))
    attention_mask = (
        [0] * len(left_pad)
        + [1] * (len(prompt_tokens) + len(target_tokens))
        + [0] * len(right_pad)
    )
    input_ids = self.tokenizer.convert_tokens_to_ids(
        left_pad + prompt_tokens + target_tokens + right_pad
    )

    # Labels are offset by max_seq_length - 1, one position earlier than the target in input_ids.
    label_offset = max_seq_length - 1
    label_pad = max_gen_seq_length - len(target_tokens) + 1
    input_ids_label = [0] * label_offset + self.tokenizer.convert_tokens_to_ids(
        target_tokens + [self.pad_token] * label_pad
    )
    attention_mask_label = (
        [0] * label_offset + [1] * len(target_tokens) + [0] * label_pad
    )

    pixel_outputs = self.image_classification(image=image)

    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        pixel_values=pixel_outputs.pixel_values,
        input_ids_label=torch.tensor(input_ids_label, dtype=torch.long),
        attention_mask_label=torch.tensor(attention_mask_label, dtype=torch.long),
    )

messages_generation ¤

messages_generation(
    messages: List[Dict[str, Any]],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs

Source code in src/unitorch/models/llava/processing.py

def messages_generation(
    self,
    messages: List[Dict[str, Any]],
    max_seq_length: Optional[int] = None,
    image: Optional[Any] = None,
    max_gen_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """
    Build training inputs from a chat history whose last assistant turn is the target.

    Trailing non-assistant messages are ignored. Everything before the last assistant
    message is rendered as the prompt, and that assistant message is rendered as the target.

    Args:
        messages (List[Dict[str, Any]]): Chat messages, each with a "role" key. Not modified.
        max_seq_length (int, optional): Forwarded to `generation`.
        image (optional): Image forwarded to `generation` (a PIL image or a string, as
            accepted there). Required in practice: `generation` has no default for it.
        max_gen_seq_length (int, optional): Forwarded to `generation`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `pixel_values`, `input_ids_label`
        and `attention_mask_label` as produced by `generation`.

    Raises:
        TypeError: If `image` is not provided.
    """
    # `generation` requires an image; the original call omitted it and always raised TypeError.
    if image is None:
        raise TypeError("messages_generation() missing required argument: 'image'")

    # Work on a shallow copy so the caller's list is not truncated.
    history = list(messages)
    while history and history[-1]["role"] != "assistant":
        history.pop()

    text = self.chat_template(history[:-1])
    text_pair = self.chat_template(history[-1:])
    outputs = self.generation(
        text=text,
        image=image,
        text_pair=text_pair,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )
    return GenericOutputs(
        input_ids=outputs.input_ids,
        attention_mask=outputs.attention_mask,
        pixel_values=outputs.pixel_values,
        input_ids_label=outputs.input_ids_label,
        attention_mask_label=outputs.attention_mask_label,
    )

LlavaLlamaSiglipProcessor¤

Bases: HfTextClassificationProcessor, HfTextGenerationProcessor

Initialize the LlavaLlamaSiglipProcessor.

Parameters:

Name	Type	Description	Default
`tokenizer_file`	`str`	Path to the tokenizer file.	required
`max_seq_length`	`int`	Maximum sequence length for text classification. Defaults to 128.	`128`
`max_gen_seq_length`	`int`	Maximum sequence length for text generation. Defaults to 48.	`48`

Source code in src/unitorch/models/llava/processing.py

def __init__(
    self,
    tokenizer_file: str,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    chat_template: Optional[str] = None,
    vision_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 128,
    max_gen_seq_length: Optional[int] = 48,
):
    """
    Build the Llama fast tokenizer and the optional Siglip image processor.

    Args:
        tokenizer_file (str): Path passed to `LlamaTokenizerFast` as `tokenizer_file`.
        tokenizer_config (str, optional): Path to a JSON file with extra tokenizer keyword arguments.
            Its `added_tokens_decoder` entry, if any, is handled separately.
        special_tokens_map (str, optional): Path to a JSON file mapping special-token names to token specs.
        chat_template (str, optional): Path to a JSON file whose `chat_template` entry is set on the tokenizer.
        vision_config_path (str, optional): Path to a Siglip image processor JSON config. When None,
            `self.vision_processor` is set to None.
        max_seq_length (int, optional): Forwarded to both text parent processors. Defaults to 128.
        max_gen_seq_length (int, optional): Forwarded to the text generation parent processor. Defaults to 48.
    """
    config_kwargs = {}
    if tokenizer_config:
        config_kwargs = read_json_file(tokenizer_config)
    token_map = {}
    if special_tokens_map:
        token_map = read_json_file(special_tokens_map)

    # The added-token table is not a constructor argument; it is applied after construction.
    extra_decoder = config_kwargs.pop("added_tokens_decoder", {})

    # Entries serialised as AddedToken dicts are converted back before being passed on.
    tokenizer_kwargs = {}
    for key, value in config_kwargs.items():
        if isinstance(value, dict) and value.get("__type") == "AddedToken":
            value = get_added_token(value)
        tokenizer_kwargs[key] = value

    tokenizer = LlamaTokenizerFast(tokenizer_file=tokenizer_file, **tokenizer_kwargs)

    for token_id, token_spec in extra_decoder.items():
        content = token_spec["content"]
        tokenizer.added_tokens_decoder[token_id] = get_added_token(token_spec)
        tokenizer.added_tokens_encoder[content] = token_id

    tokenizer.add_special_tokens(
        {name: get_added_token(spec) for name, spec in token_map.items()}
    )

    if chat_template:
        tokenizer.chat_template = read_json_file(chat_template)["chat_template"]

    # cls/sep are aliased to bos/eos; the pad id is looked up from the existing pad token.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token
    tokenizer.cls_token_id = tokenizer.bos_token_id
    tokenizer.sep_token_id = tokenizer.eos_token_id
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    HfTextClassificationProcessor.__init__(
        self, tokenizer=tokenizer, max_seq_length=max_seq_length
    )
    HfTextGenerationProcessor.__init__(
        self,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

    if vision_config_path is None:
        self.vision_processor = None
    else:
        self.vision_processor = SiglipImageProcessor.from_json_file(
            vision_config_path
        )

vision_processor `instance-attribute` ¤

vision_processor = from_json_file(vision_config_path)

chat_template ¤

chat_template(messages: List[Dict[str, Any]])

Source code in src/unitorch/models/llava/processing.py

def chat_template(
    self,
    messages: List[Dict[str, Any]],
):
    """
    Render chat messages through the tokenizer's chat template.

    Args:
        messages (List[Dict[str, Any]]): Messages passed unchanged to `apply_chat_template`.

    Returns:
        The result of `tokenizer.apply_chat_template` called with `tokenize=False` and
        `add_generation_prompt=True`.
    """
    return self.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

image_classification ¤

image_classification(image: Union[Image, str])

Source code in src/unitorch/models/llava/processing.py

def image_classification(
    self,
    image: Union[Image.Image, str],
):
    """
    Preprocess an image with `self.vision_processor`.

    Args:
        image (Union[Image.Image, str]): Image passed to `vision_processor.preprocess`.

    Returns:
        GenericOutputs: Holds `pixel_values`, the first element of the processor's `pixel_values`.
    """
    processed = self.vision_processor.preprocess(image, return_tensors="pt")
    return GenericOutputs(pixel_values=processed.pixel_values[0])

classification ¤

classification(
    text: str,
    image: Union[Image, str],
    text_pair: Optional[str] = None,
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/llava/processing.py

def classification(
    self,
    text: str,
    image: Union[Image.Image, str],
    text_pair: Optional[str] = None,
    max_seq_length: Optional[int] = None,
):
    """
    Tokenize text (optionally with a second segment), left-pad it and preprocess the image.

    Args:
        text (str): Input text.
        image (Union[Image.Image, str]): Image passed to `vision_processor.preprocess`.
        text_pair (str, optional): Second segment appended after `text`; the pair is truncated jointly.
        max_seq_length (int, optional): Target length; falls back to `self.max_seq_length`.

    Returns:
        GenericOutputs: `input_ids` and `attention_mask` of length `max_seq_length`, plus `pixel_values`.
    """
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)

    tokens = self.tokenizer.tokenize(str(text))
    if text_pair is not None:
        pair_tokens = self.tokenizer.tokenize(str(text_pair))
        truncate_sequence_pair(tokens, pair_tokens, max_seq_length)
        tokens = tokens + pair_tokens
    else:
        tokens = tokens[:max_seq_length]
    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

    # Padding goes on the left so the real tokens end the sequence.
    pad_count = len([0] * (max_seq_length - len(input_ids)))
    attention_mask = [0] * pad_count + [1] * len(input_ids)
    input_ids = [self.pad_token_id] * pad_count + input_ids

    processed = self.vision_processor.preprocess(image, return_tensors="pt")
    pixel_values = processed.pixel_values[0]

    assert len(input_ids) == max_seq_length
    assert len(attention_mask) == max_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        pixel_values=pixel_values,
    )

generation_inputs ¤

generation_inputs(
    text: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/llava/processing.py

def generation_inputs(
    self,
    text: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """
    Build left-padded prompt inputs and image features for generation.

    Args:
        text (str): Prompt text.
        image (Union[Image.Image, str]): Image passed to `vision_processor.preprocess`.
        max_seq_length (int, optional): Target length; falls back to `self.max_seq_length`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask` and `pixel_values`.
    """
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)

    # BOS followed by the trailing tokens of the prompt (the slice keeps the end of the text).
    tokens = [self.bos_token]
    tokens.extend(self.tokenizer.tokenize(str(text))[1 - max_seq_length :])

    left_pad = [self.pad_token] * (max_seq_length - len(tokens))
    attention_mask = [0] * len(left_pad) + [1] * len(tokens)
    input_ids = self.tokenizer.convert_tokens_to_ids(left_pad + tokens)

    processed = self.vision_processor.preprocess(image, return_tensors="pt")
    pixel_values = processed.pixel_values[0]

    assert len(input_ids) == max_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        pixel_values=pixel_values,
    )

generation_labels ¤

generation_labels(
    text: str, max_gen_seq_length: Optional[int] = None
)

Source code in src/unitorch/models/llava/processing.py

def generation_labels(
    self,
    text: str,
    max_gen_seq_length: Optional[int] = None,
):
    """
    Tokenize target text into right-padded label ids ending with EOS.

    Args:
        text (str): Target text.
        max_gen_seq_length (int, optional): Target length; falls back to `self.max_gen_seq_length`.

    Returns:
        GenericOutputs: `input_ids` and `attention_mask`, both of length `max_gen_seq_length`.
    """
    max_gen_seq_length = pop_value(
        max_gen_seq_length,
        self.max_gen_seq_length,
    )
    # One slot is reserved for the EOS token appended after truncation.
    tokens = self.tokenizer.tokenize(str(text))[: max_gen_seq_length - 1] + [
        self.eos_token
    ]
    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(input_ids)

    # Pad on the right with pad ids; padded positions are masked out.
    # (The original also built an unused list of pad tokens here; it was dead code.)
    pad_length = max_gen_seq_length - len(input_ids)
    input_ids += [self.pad_token_id] * pad_length
    attention_mask += [0] * pad_length

    assert len(input_ids) == max_gen_seq_length
    assert len(attention_mask) == max_gen_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
    )

generation ¤

generation(
    text: str,
    image: Union[Image, str],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/llava/processing.py

def generation(
    self,
    text: str,
    image: Union[Image.Image, str],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
):
    """
    Build joint prompt+target inputs, shifted labels and image features for training.

    Args:
        text (str): Prompt text.
        image (Union[Image.Image, str]): Image passed to `vision_processor.preprocess`.
        text_pair (str): Target text.
        max_seq_length (int, optional): Prompt length; falls back to `self.max_seq_length`.
        max_gen_seq_length (int, optional): Target length; falls back to `self.max_gen_seq_length`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `pixel_values`, `input_ids_label`
        and `attention_mask_label`.
    """
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)
    max_gen_seq_length = pop_value(max_gen_seq_length, self.max_gen_seq_length)

    prompt_tokens = [self.bos_token] + self.tokenizer.tokenize(str(text))[
        1 - max_seq_length :
    ]
    target_tokens = self.tokenizer.tokenize(str(text_pair))[
        : max_gen_seq_length - 1
    ] + [self.eos_token]

    # The prompt is padded on the left and the target on the right.
    left_pad = [self.pad_token] * (max_seq_length - len(prompt_tokens))
    right_pad = [self.pad_token] * (max_gen_seq_length - len(target_tokens))
    attention_mask = (
        [0] * len(left_pad)
        + [1] * (len(prompt_tokens) + len(target_tokens))
        + [0] * len(right_pad)
    )
    input_ids = self.tokenizer.convert_tokens_to_ids(
        left_pad + prompt_tokens + target_tokens + right_pad
    )

    # Labels are offset by max_seq_length - 1, one position earlier than the target in input_ids.
    label_offset = max_seq_length - 1
    label_pad = max_gen_seq_length - len(target_tokens) + 1
    input_ids_label = [0] * label_offset + self.tokenizer.convert_tokens_to_ids(
        target_tokens + [self.pad_token] * label_pad
    )
    attention_mask_label = (
        [0] * label_offset + [1] * len(target_tokens) + [0] * label_pad
    )

    processed = self.vision_processor.preprocess(image, return_tensors="pt")
    pixel_values = processed.pixel_values[0]

    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        pixel_values=pixel_values,
        input_ids_label=torch.tensor(input_ids_label, dtype=torch.long),
        attention_mask_label=torch.tensor(attention_mask_label, dtype=torch.long),
    )

messages_generation ¤

messages_generation(
    messages: List[Dict[str, Any]],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs

Source code in src/unitorch/models/llava/processing.py

def messages_generation(
    self,
    messages: List[Dict[str, Any]],
    max_seq_length: Optional[int] = None,
    image: Optional[Any] = None,
    max_gen_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """
    Build training inputs from a chat history whose last assistant turn is the target.

    Trailing non-assistant messages are ignored. Everything before the last assistant
    message is rendered as the prompt, and that assistant message is rendered as the target.

    Args:
        messages (List[Dict[str, Any]]): Chat messages, each with a "role" key. Not modified.
        max_seq_length (int, optional): Forwarded to `generation`.
        image (optional): Image forwarded to `generation` (a PIL image or a string, as
            accepted there). Required in practice: `generation` has no default for it.
        max_gen_seq_length (int, optional): Forwarded to `generation`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `pixel_values`, `input_ids_label`
        and `attention_mask_label` as produced by `generation`.

    Raises:
        TypeError: If `image` is not provided.
    """
    # `generation` requires an image; the original call omitted it and always raised TypeError.
    if image is None:
        raise TypeError("messages_generation() missing required argument: 'image'")

    # Work on a shallow copy so the caller's list is not truncated.
    history = list(messages)
    while history and history[-1]["role"] != "assistant":
        history.pop()

    text = self.chat_template(history[:-1])
    text_pair = self.chat_template(history[-1:])
    outputs = self.generation(
        text=text,
        image=image,
        text_pair=text_pair,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )
    return GenericOutputs(
        input_ids=outputs.input_ids,
        attention_mask=outputs.attention_mask,
        pixel_values=outputs.pixel_values,
        input_ids_label=outputs.input_ids_label,
        attention_mask_label=outputs.attention_mask_label,
    )

LlavaMistralClipForClassification¤

Bases: GenericModel, PeftWeightLoaderMixin

Source code in src/unitorch/models/llava/modeling.py

def __init__(
    self,
    config_path: str,
    image_token_index: Optional[int] = 32000,
    num_classes: Optional[int] = 1,
    hidden_dropout_prob: Optional[float] = 0.1,
    freeze_vision_encoder: Optional[bool] = True,
    freeze_multi_modal_projector: Optional[bool] = True,
    freeze_llm_encoder: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the CLIP vision tower, projector, Mistral backbone and classification head.

    Args:
        config_path (str): Path to the JSON file loaded with `LlavaNextConfig.from_json_file`.
        image_token_index (int, optional): Token id stored as `self.image_token_index`. Defaults to 32000.
        num_classes (int, optional): Output size of the linear classifier. Defaults to 1.
        hidden_dropout_prob (float, optional): Probability for the dropout layer. Defaults to 0.1.
        freeze_vision_encoder (bool, optional): Disable gradients for the vision tower. Defaults to True.
        freeze_multi_modal_projector (bool, optional): Disable gradients for the projector. Defaults to True.
        freeze_llm_encoder (bool, optional): Disable gradients for the language model. Defaults to True.
        gradient_checkpointing (bool, optional): Value stored on the config. Defaults to False.
    """
    super().__init__()
    self.config = LlavaNextConfig.from_json_file(config_path)
    self.config.gradient_checkpointing = gradient_checkpointing

    text_hidden_size = self.config.text_config.hidden_size

    # Sub-modules are created in a fixed order; the random init of image_newline sits between them.
    self.vision_tower = CLIPVisionModel(self.config.vision_config)
    self.multi_modal_projector = LlavaNextMultiModalProjector(self.config)
    embed_std = 1 / math.sqrt(text_hidden_size)
    self.image_newline = nn.Parameter(
        torch.randn(text_hidden_size, dtype=self.dtype) * embed_std
    )
    self.language_model = MistralModel(self.config.text_config)
    self.dropout = nn.Dropout(hidden_dropout_prob)
    self.classifier = nn.Linear(text_hidden_size, num_classes)
    self.init_weights()

    # Freezing happens after weight initialisation.
    freeze_plan = (
        (freeze_vision_encoder, self.vision_tower),
        (freeze_multi_modal_projector, self.multi_modal_projector),
        (freeze_llm_encoder, self.language_model),
    )
    for should_freeze, module in freeze_plan:
        if should_freeze:
            for param in module.parameters():
                param.requires_grad = False

    self.image_token_index = image_token_index

replace_keys_in_state_dict `class-attribute` `instance-attribute` ¤

replace_keys_in_state_dict = {
    "language_model.model.": "language_model."
}

config `instance-attribute` ¤

config = from_json_file(config_path)

vision_tower `instance-attribute` ¤

vision_tower = CLIPVisionModel(vision_config)

multi_modal_projector `instance-attribute` ¤

multi_modal_projector = LlavaNextMultiModalProjector(config)

image_newline `instance-attribute` ¤

image_newline = Parameter(
    randn(hidden_size, dtype=dtype) * embed_std
)

language_model `instance-attribute` ¤

language_model = MistralModel(text_config)

dropout `instance-attribute` ¤

dropout = Dropout(hidden_dropout_prob)

classifier `instance-attribute` ¤

classifier = Linear(hidden_size, num_classes)

image_token_index `instance-attribute` ¤

image_token_index = image_token_index

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/models/llava/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Merge image embeddings into the text sequence and classify from the last position.

    Args:
        input_ids (torch.Tensor): Token ids indexed as (batch, sequence). Positions equal to
            `self.image_token_index` are replaced by the image embeddings. Not modified.
        pixel_values (torch.Tensor): Image input for the vision tower.
        attention_mask (torch.Tensor, optional): Mask for the text positions. Defaults to all ones.

    Returns:
        torch.Tensor: Classifier output computed from the last position of the merged sequence.
    """
    vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
    # Second-to-last hidden state, dropping the first position
    # (presumably the CLS token — confirm against the vision config).
    image_embeds = vision_outputs.hidden_states[-2][:, 1:]
    image_embeds = self.multi_modal_projector(image_embeds)
    # Append the learned image_newline vector after the image positions.
    image_embeds = torch.cat(
        [
            image_embeds,
            self.image_newline.expand(
                image_embeds.shape[0], 1, image_embeds.shape[-1]
            ),
        ],
        dim=1,
    )

    image_seq_length = image_embeds.size(1)
    is_image_token = input_ids == self.image_token_index
    batch_indices, text_indices = torch.where(~is_image_token)
    # Each image token expands to image_seq_length positions, shifting later text right.
    image_masks = is_image_token.long() * (image_seq_length - 1)
    new_positions = torch.cumsum(image_masks + 1, dim=1) - 1
    new_text_indices = new_positions[batch_indices, text_indices]

    # Out-of-place replacement: the original assigned into `input_ids` and so erased
    # the image-token marker from the caller's tensor.
    input_ids = input_ids.masked_fill(is_image_token, 0)
    text_embeds = self.language_model.get_input_embeddings()(input_ids)

    batch_size, text_seq_length, text_dim = text_embeds.size()

    if attention_mask is None:
        attention_mask = torch.ones(batch_size, text_seq_length).to(
            text_embeds.device
        )

    # The merged length accounts for exactly one image token per row.
    final_embeds = torch.zeros(
        batch_size, text_seq_length + image_seq_length - 1, text_dim
    ).to(text_embeds.device)
    # 1 marks a slot filled from the image, 0 a slot filled from text.
    overwrite_masks = torch.ones(
        batch_size, text_seq_length + image_seq_length - 1
    ).to(text_embeds.device)
    overwrite_masks[batch_indices, new_text_indices] = 0
    final_embeds[overwrite_masks == 0] = text_embeds[
        batch_indices, text_indices
    ].to(final_embeds)
    final_embeds[overwrite_masks == 1] = (
        image_embeds.contiguous().view(-1, text_dim).to(final_embeds)
    )
    final_masks = torch.zeros(
        batch_size, text_seq_length + image_seq_length - 1
    ).to(attention_mask)
    final_masks[overwrite_masks == 0] = attention_mask[
        batch_indices, text_indices
    ].to(final_masks)
    final_masks[overwrite_masks == 1] = 1
    # Positions count attended tokens only; masked slots get -1.
    position_ids = (final_masks.cumsum(dim=1) - 1).masked_fill(final_masks == 0, -1)

    outputs = self.language_model(
        inputs_embeds=final_embeds,
        attention_mask=final_masks,
        position_ids=position_ids,
    )[0]
    pooled_output = outputs[:, -1]
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    return logits

LlavaMistralClipForGeneration¤

Bases: GenericModel, PeftWeightLoaderMixin

Source code in src/unitorch/models/llava/modeling.py

def __init__(
    self,
    config_path: str,
    image_token_index: Optional[int] = 32000,
    freeze_vision_encoder: Optional[bool] = True,
    freeze_multi_modal_projector: Optional[bool] = False,
    freeze_llm_encoder: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the CLIP vision tower, projector and Mistral causal language model.

    Args:
        config_path (str): Path to the JSON file loaded with `LlavaNextConfig.from_json_file`.
        image_token_index (int, optional): Token id stored as `self.image_token_index`. Defaults to 32000.
        freeze_vision_encoder (bool, optional): Disable gradients for the vision tower. Defaults to True.
        freeze_multi_modal_projector (bool, optional): Disable gradients for the projector. Defaults to False.
        freeze_llm_encoder (bool, optional): Disable gradients for the language model. Defaults to True.
        gradient_checkpointing (bool, optional): Value stored on the config. Defaults to False.
    """
    super().__init__()
    self.config = LlavaNextConfig.from_json_file(config_path)
    self.config.gradient_checkpointing = gradient_checkpointing

    text_hidden_size = self.config.text_config.hidden_size

    # Sub-modules are created in a fixed order; the random init of image_newline sits between them.
    self.vision_tower = CLIPVisionModel(self.config.vision_config)
    self.multi_modal_projector = LlavaNextMultiModalProjector(self.config)
    embed_std = 1 / math.sqrt(text_hidden_size)
    self.image_newline = nn.Parameter(
        torch.randn(text_hidden_size, dtype=self.dtype) * embed_std
    )
    self.language_model = MistralForCausalLM(self.config.text_config)
    self.init_weights()

    # Freezing happens after weight initialisation.
    freeze_plan = (
        (freeze_vision_encoder, self.vision_tower),
        (freeze_multi_modal_projector, self.multi_modal_projector),
        (freeze_llm_encoder, self.language_model),
    )
    for should_freeze, module in freeze_plan:
        if should_freeze:
            for param in module.parameters():
                param.requires_grad = False

    self.image_token_index = image_token_index

config `instance-attribute` ¤

config = from_json_file(config_path)

vision_tower `instance-attribute` ¤

vision_tower = CLIPVisionModel(vision_config)

multi_modal_projector `instance-attribute` ¤

multi_modal_projector = LlavaNextMultiModalProjector(config)

image_newline `instance-attribute` ¤

image_newline = Parameter(
    randn(hidden_size, dtype=dtype) * embed_std
)

language_model `instance-attribute` ¤

language_model = MistralForCausalLM(text_config)

image_token_index `instance-attribute` ¤

image_token_index = image_token_index

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/models/llava/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Merge image embeddings into the text sequence and return logits at the text positions.

    Args:
        input_ids (torch.Tensor): Token ids indexed as (batch, sequence). Positions equal to
            `self.image_token_index` are replaced by the image embeddings. Not modified.
        pixel_values (torch.Tensor): Image input for the vision tower.
        attention_mask (torch.Tensor, optional): Mask for the text positions. Defaults to all ones.

    Returns:
        torch.Tensor: Logits aligned with the original text positions; the image-token
        position stays zero.
    """
    vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
    # Second-to-last hidden state, dropping the first position
    # (presumably the CLS token — confirm against the vision config).
    image_embeds = vision_outputs.hidden_states[-2][:, 1:]
    image_embeds = self.multi_modal_projector(image_embeds)
    # Append the learned image_newline vector after the image positions.
    image_embeds = torch.cat(
        [
            image_embeds,
            self.image_newline.expand(
                image_embeds.shape[0], 1, image_embeds.shape[-1]
            ),
        ],
        dim=1,
    )

    image_seq_length = image_embeds.size(1)
    is_image_token = input_ids == self.image_token_index
    batch_indices, text_indices = torch.where(~is_image_token)
    # Each image token expands to image_seq_length positions, shifting later text right.
    image_masks = is_image_token.long() * (image_seq_length - 1)
    new_positions = torch.cumsum(image_masks + 1, dim=1) - 1
    new_text_indices = new_positions[batch_indices, text_indices]

    # Out-of-place replacement: the original assigned into `input_ids` and so erased
    # the image-token marker from the caller's tensor.
    input_ids = input_ids.masked_fill(is_image_token, 0)
    text_embeds = self.language_model.get_input_embeddings()(input_ids)

    batch_size, text_seq_length, text_dim = text_embeds.size()

    if attention_mask is None:
        attention_mask = torch.ones(batch_size, text_seq_length).to(
            text_embeds.device
        )

    # The merged length accounts for exactly one image token per row.
    final_embeds = torch.zeros(
        batch_size, text_seq_length + image_seq_length - 1, text_dim
    ).to(text_embeds.device)
    # 1 marks a slot filled from the image, 0 a slot filled from text.
    overwrite_masks = torch.ones(
        batch_size, text_seq_length + image_seq_length - 1
    ).to(text_embeds.device)
    overwrite_masks[batch_indices, new_text_indices] = 0
    final_embeds[overwrite_masks == 0] = text_embeds[
        batch_indices, text_indices
    ].to(final_embeds)
    final_embeds[overwrite_masks == 1] = (
        image_embeds.contiguous().view(-1, text_dim).to(final_embeds)
    )
    final_masks = torch.zeros(
        batch_size, text_seq_length + image_seq_length - 1
    ).to(attention_mask)
    final_masks[overwrite_masks == 0] = attention_mask[
        batch_indices, text_indices
    ].to(final_masks)
    final_masks[overwrite_masks == 1] = 1
    # Positions count attended tokens only; masked slots get -1.
    position_ids = (final_masks.cumsum(dim=1) - 1).masked_fill(final_masks == 0, -1)

    outputs = self.language_model(
        inputs_embeds=final_embeds,
        attention_mask=final_masks,
        position_ids=position_ids,
    )
    # Copy the merged-sequence logits back onto the original text positions.
    logits = torch.zeros(batch_size, text_seq_length, outputs.logits.size(-1)).to(
        outputs.logits.device
    )
    logits[batch_indices, text_indices] = outputs.logits[
        batch_indices, new_text_indices
    ]
    return logits

generate ¤

generate(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 1,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 2,
    decoder_pad_token_id: Optional[int] = 32001,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
)

Source code in src/unitorch/models/llava/modeling.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 1,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 2,
    decoder_pad_token_id: Optional[int] = 32001,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
):
    """
    Generate token sequences conditioned on a text prompt and an image.

    The image placeholder token in ``input_ids`` is expanded into the projected
    image embeddings (plus one ``image_newline`` embedding), the merged
    embeddings are handed to ``self.language_model.generate`` and the result is
    padded to ``max_gen_seq_length`` with ``decoder_start_token_id``.

    The merged buffers have length ``text_seq_length + image_seq_length - 1``
    and every non-text slot is filled from the image embeddings, so each row of
    ``input_ids`` must contain exactly one image placeholder token.

    Args:
        input_ids (torch.Tensor): 2-D tensor of token ids containing the image
            placeholder ``self.image_token_index``. It is not modified.
        pixel_values (torch.Tensor): Image input forwarded to ``self.vision_tower``.
        attention_mask (torch.Tensor, optional): Mask aligned with ``input_ids``.
            Defaults to all ones.
        num_beams, do_sample, no_repeat_ngram_size, early_stopping,
        length_penalty, repetition_penalty, num_return_sequences,
        num_beam_groups, diversity_penalty, temperature, top_k, top_p:
            Forwarded unchanged to ``self.language_model.generate``.
        decoder_start_token_id (int, optional): Forwarded as ``bos_token_id`` and
            also used as the fill value when padding the returned sequences.
        decoder_end_token_id (int or List[int], optional): Forwarded as ``eos_token_id``.
        decoder_pad_token_id (int, optional): Forwarded as ``pad_token_id``.
        min_gen_seq_length (int, optional): Added to the merged prompt length to
            form ``min_length``.
        max_gen_seq_length (int, optional): Added to the merged prompt length to
            form ``max_length``; also the last dimension of the returned sequences.

    Returns:
        GenericOutputs: ``sequences`` (long tensor whose last dimension is
        ``max_gen_seq_length``; the ``num_return_sequences`` axis is flattened
        away when it equals 1) and ``sequences_scores`` taken from the
        generation output.
    """
    vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
    # Second-to-last hidden state with the first position dropped.
    image_embeds = vision_outputs.hidden_states[-2][:, 1:]
    image_embeds = self.multi_modal_projector(image_embeds)
    image_embeds = torch.cat(
        [
            image_embeds,
            self.image_newline.expand(
                image_embeds.shape[0], 1, image_embeds.shape[-1]
            ),
        ],
        dim=1,
    )

    image_seq_length = image_embeds.size(1)
    is_image_token = input_ids == self.image_token_index
    batch_indices, text_indices = torch.where(~is_image_token)
    # A placeholder occupies `image_seq_length` slots after merging, so every
    # token after it is shifted right by `image_seq_length - 1`.
    image_masks = is_image_token.long() * (image_seq_length - 1)
    new_positions = torch.cumsum(image_masks + 1, dim=1) - 1
    new_text_indices = new_positions[batch_indices, text_indices]

    # Replace the placeholder id with 0 on a copy so the caller's tensor is
    # left untouched (the original wrote into `input_ids` in place).
    text_input_ids = input_ids.masked_fill(is_image_token, 0)
    text_embeds = self.language_model.get_input_embeddings()(text_input_ids)

    batch_size, text_seq_length, text_dim = text_embeds.size()
    device = text_embeds.device
    if attention_mask is None:
        attention_mask = torch.ones(batch_size, text_seq_length, device=device)

    merged_length = text_seq_length + image_seq_length - 1
    # True where a merged slot holds a text token, False where it holds an image embedding.
    text_slots = torch.zeros(
        batch_size, merged_length, dtype=torch.bool, device=device
    )
    text_slots[batch_indices, new_text_indices] = True

    final_embeds = torch.zeros(batch_size, merged_length, text_dim, device=device)
    final_embeds[text_slots] = text_embeds[batch_indices, text_indices].to(
        final_embeds
    )
    final_embeds[~text_slots] = (
        image_embeds.contiguous().view(-1, text_dim).to(final_embeds)
    )

    final_masks = torch.zeros(
        batch_size,
        merged_length,
        dtype=attention_mask.dtype,
        device=attention_mask.device,
    )
    final_masks[text_slots] = attention_mask[batch_indices, text_indices]
    # Image slots are always attended to.
    final_masks[~text_slots] = 1

    input_seq_length = final_embeds.size(1)
    outputs = self.language_model.generate(
        inputs_embeds=final_embeds,
        attention_mask=final_masks,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        pad_token_id=decoder_pad_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )
    # Pad to a fixed width with the start token.
    # NOTE(review): assumes the generated width does not exceed
    # `max_gen_seq_length`; otherwise the copy below fails — confirm.
    padded_sequences = torch.full(
        (sequences.size(0), num_return_sequences, max_gen_seq_length),
        decoder_start_token_id,
        dtype=sequences.dtype,
        device=sequences.device,
    )
    padded_sequences[:, :, : sequences.size(-1)].copy_(sequences)

    if num_return_sequences == 1:
        padded_sequences = padded_sequences.reshape(-1, max_gen_seq_length)

    # NOTE(review): `sequences_scores` is read unconditionally — confirm it is
    # present for every decoding mode used by callers.
    return GenericOutputs(
        sequences=padded_sequences.long(),
        sequences_scores=outputs.sequences_scores,
    )

LlavaLlamaSiglipForGeneration¤

Bases: GenericModel, PeftWeightLoaderMixin

Source code in src/unitorch/models/llava/modeling.py

def __init__(
    self,
    config_path: str,
    image_token_index: Optional[int] = 128077,
    freeze_vision_encoder: Optional[bool] = True,
    freeze_multi_modal_projector: Optional[bool] = False,
    freeze_llm_encoder: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the vision tower, multi-modal projector and language model from a
    Llava config file and optionally freeze their parameters.

    Args:
        config_path (str): Path to the JSON file loaded with ``LlavaConfig.from_json_file``.
        image_token_index (int, optional): Token id stored as ``self.image_token_index``.
        freeze_vision_encoder (bool, optional): Disable gradients for ``self.vision_tower``.
        freeze_multi_modal_projector (bool, optional): Disable gradients for
            ``self.multi_modal_projector``.
        freeze_llm_encoder (bool, optional): Disable gradients for ``self.language_model``.
        gradient_checkpointing (bool, optional): Value written to
            ``config.gradient_checkpointing``.
    """
    super().__init__()
    llava_config = LlavaConfig.from_json_file(config_path)
    llava_config.gradient_checkpointing = gradient_checkpointing
    self.config = llava_config
    self.vision_tower = SiglipVisionModel(llava_config.vision_config)
    self.multi_modal_projector = LlavaMultiModalProjector(llava_config)
    self.language_model = LlamaForCausalLM(llava_config.text_config)
    self.init_weights()

    # Pair each freeze flag with the sub-module it controls.
    freeze_plan = (
        (freeze_vision_encoder, self.vision_tower),
        (freeze_multi_modal_projector, self.multi_modal_projector),
        (freeze_llm_encoder, self.language_model),
    )
    for should_freeze, module in freeze_plan:
        if not should_freeze:
            continue
        for weight in module.parameters():
            weight.requires_grad = False

    self.image_token_index = image_token_index

config `instance-attribute` ¤

config = from_json_file(config_path)

vision_tower `instance-attribute` ¤

vision_tower = SiglipVisionModel(vision_config)

multi_modal_projector `instance-attribute` ¤

multi_modal_projector = LlavaMultiModalProjector(config)

language_model `instance-attribute` ¤

language_model = LlamaForCausalLM(text_config)

image_token_index `instance-attribute` ¤

image_token_index = image_token_index

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/models/llava/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Run the language model on text tokens merged with projected image embeddings.

    The image placeholder token in ``input_ids`` is expanded into the projected
    image embeddings before the language model is called. The merged buffers
    have length ``text_seq_length + image_seq_length - 1`` and every non-text
    slot is filled from the image embeddings, so each row of ``input_ids`` must
    contain exactly one image placeholder token.

    Args:
        input_ids (torch.Tensor): 2-D tensor of token ids containing the image
            placeholder ``self.image_token_index``. It is not modified.
        pixel_values (torch.Tensor): Image input forwarded to ``self.vision_tower``.
        attention_mask (torch.Tensor, optional): Mask aligned with ``input_ids``.
            Defaults to all ones.

    Returns:
        torch.Tensor: Logits of shape ``(batch_size, text_seq_length, vocab)``
        aligned with ``input_ids``; the row at the image placeholder position
        is left as zeros.
    """
    vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
    # The projector consumes the second-to-last hidden state of the vision tower.
    image_embeds = self.multi_modal_projector(vision_outputs.hidden_states[-2])
    image_seq_length = image_embeds.size(1)

    is_image_token = input_ids == self.image_token_index
    batch_indices, text_indices = torch.where(~is_image_token)
    # A placeholder occupies `image_seq_length` slots after merging, so every
    # token after it is shifted right by `image_seq_length - 1`.
    image_masks = is_image_token.long() * (image_seq_length - 1)
    new_positions = torch.cumsum(image_masks + 1, dim=1) - 1
    new_text_indices = new_positions[batch_indices, text_indices]

    # Replace the placeholder id with 0 on a copy so the caller's tensor is
    # left untouched (the original wrote into `input_ids` in place).
    text_input_ids = input_ids.masked_fill(is_image_token, 0)
    text_embeds = self.language_model.get_input_embeddings()(text_input_ids)

    batch_size, text_seq_length, text_dim = text_embeds.size()
    device = text_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(batch_size, text_seq_length, device=device)

    merged_length = text_seq_length + image_seq_length - 1
    # True where a merged slot holds a text token, False where it holds an image embedding.
    text_slots = torch.zeros(
        batch_size, merged_length, dtype=torch.bool, device=device
    )
    text_slots[batch_indices, new_text_indices] = True

    final_embeds = torch.zeros(batch_size, merged_length, text_dim, device=device)
    final_embeds[text_slots] = text_embeds[batch_indices, text_indices].to(
        final_embeds
    )
    final_embeds[~text_slots] = (
        image_embeds.contiguous().view(-1, text_dim).to(final_embeds)
    )

    final_masks = torch.zeros(
        batch_size,
        merged_length,
        dtype=attention_mask.dtype,
        device=attention_mask.device,
    )
    final_masks[text_slots] = attention_mask[batch_indices, text_indices]
    # Image slots are always attended to.
    final_masks[~text_slots] = 1
    # Positions count attended slots only; masked-out slots get -1.
    position_ids = (final_masks.cumsum(dim=1) - 1).masked_fill(final_masks == 0, -1)

    outputs = self.language_model(
        inputs_embeds=final_embeds,
        attention_mask=final_masks,
        position_ids=position_ids,
    )
    # Scatter the logits of the text slots back to their original positions.
    logits = torch.zeros(
        batch_size,
        text_seq_length,
        outputs.logits.size(-1),
        device=outputs.logits.device,
    )
    logits[batch_indices, text_indices] = outputs.logits[
        batch_indices, new_text_indices
    ]
    return logits

generate ¤

generate(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 128000,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = [128001, 128008, 128009],
    decoder_pad_token_id: Optional[int] = 128004,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
)

Source code in src/unitorch/models/llava/modeling.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 128000,
    decoder_end_token_id: Optional[Union[int, List[int]]] = [
        128001,
        128008,
        128009,
    ],
    decoder_pad_token_id: Optional[int] = 128004,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
):
    """
    Generate token sequences conditioned on a text prompt and an image.

    The image placeholder token in ``input_ids`` is expanded into the projected
    image embeddings, the merged embeddings are handed to
    ``self.language_model.generate`` and the result is padded to
    ``max_gen_seq_length`` with ``decoder_start_token_id``.

    The merged buffers have length ``text_seq_length + image_seq_length - 1``
    and every non-text slot is filled from the image embeddings, so each row of
    ``input_ids`` must contain exactly one image placeholder token.

    Args:
        input_ids (torch.Tensor): 2-D tensor of token ids containing the image
            placeholder ``self.image_token_index``. It is not modified.
        pixel_values (torch.Tensor): Image input forwarded to ``self.vision_tower``.
        attention_mask (torch.Tensor, optional): Mask aligned with ``input_ids``.
            Defaults to all ones.
        num_beams, do_sample, no_repeat_ngram_size, early_stopping,
        length_penalty, repetition_penalty, num_return_sequences,
        num_beam_groups, diversity_penalty, temperature, top_k, top_p:
            Forwarded unchanged to ``self.language_model.generate``.
        decoder_start_token_id (int, optional): Forwarded as ``bos_token_id`` and
            also used as the fill value when padding the returned sequences.
        decoder_end_token_id (int or List[int], optional): Forwarded as
            ``eos_token_id``. The default list is only passed through by this
            method, never modified here.
        decoder_pad_token_id (int, optional): Forwarded as ``pad_token_id``.
        min_gen_seq_length (int, optional): Added to the merged prompt length to
            form ``min_length``.
        max_gen_seq_length (int, optional): Added to the merged prompt length to
            form ``max_length``; also the last dimension of the returned sequences.

    Returns:
        GenericOutputs: ``sequences`` (long tensor whose last dimension is
        ``max_gen_seq_length``; the ``num_return_sequences`` axis is flattened
        away when it equals 1) and ``sequences_scores`` taken from the
        generation output.
    """
    vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
    # The projector consumes the second-to-last hidden state of the vision tower.
    image_embeds = self.multi_modal_projector(vision_outputs.hidden_states[-2])
    image_seq_length = image_embeds.size(1)

    is_image_token = input_ids == self.image_token_index
    batch_indices, text_indices = torch.where(~is_image_token)
    # A placeholder occupies `image_seq_length` slots after merging, so every
    # token after it is shifted right by `image_seq_length - 1`.
    image_masks = is_image_token.long() * (image_seq_length - 1)
    new_positions = torch.cumsum(image_masks + 1, dim=1) - 1
    new_text_indices = new_positions[batch_indices, text_indices]

    # Replace the placeholder id with 0 on a copy so the caller's tensor is
    # left untouched (the original wrote into `input_ids` in place).
    text_input_ids = input_ids.masked_fill(is_image_token, 0)
    text_embeds = self.language_model.get_input_embeddings()(text_input_ids)

    batch_size, text_seq_length, text_dim = text_embeds.size()
    device = text_embeds.device
    if attention_mask is None:
        attention_mask = torch.ones(batch_size, text_seq_length, device=device)

    merged_length = text_seq_length + image_seq_length - 1
    # True where a merged slot holds a text token, False where it holds an image embedding.
    text_slots = torch.zeros(
        batch_size, merged_length, dtype=torch.bool, device=device
    )
    text_slots[batch_indices, new_text_indices] = True

    final_embeds = torch.zeros(batch_size, merged_length, text_dim, device=device)
    final_embeds[text_slots] = text_embeds[batch_indices, text_indices].to(
        final_embeds
    )
    final_embeds[~text_slots] = (
        image_embeds.contiguous().view(-1, text_dim).to(final_embeds)
    )

    final_masks = torch.zeros(
        batch_size,
        merged_length,
        dtype=attention_mask.dtype,
        device=attention_mask.device,
    )
    final_masks[text_slots] = attention_mask[batch_indices, text_indices]
    # Image slots are always attended to.
    final_masks[~text_slots] = 1

    input_seq_length = final_embeds.size(1)
    outputs = self.language_model.generate(
        inputs_embeds=final_embeds,
        attention_mask=final_masks,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        pad_token_id=decoder_pad_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )
    # Pad to a fixed width with the start token.
    # NOTE(review): assumes the generated width does not exceed
    # `max_gen_seq_length`; otherwise the copy below fails — confirm.
    padded_sequences = torch.full(
        (sequences.size(0), num_return_sequences, max_gen_seq_length),
        decoder_start_token_id,
        dtype=sequences.dtype,
        device=sequences.device,
    )
    padded_sequences[:, :, : sequences.size(-1)].copy_(sequences)

    if num_return_sequences == 1:
        padded_sequences = padded_sequences.reshape(-1, max_gen_seq_length)

    # NOTE(review): `sequences_scores` is read unconditionally — confirm it is
    # present for every decoding mode used by callers.
    return GenericOutputs(
        sequences=padded_sequences.long(),
        sequences_scores=outputs.sequences_scores,
    )

unitorch.models.llava¤

LlavaMistralClipProcessor¤

chat_template ¤

image_classification ¤

classification ¤

generation_inputs ¤

generation_labels ¤

generation ¤

messages_generation ¤

LlavaLlamaSiglipProcessor¤

vision_processor instance-attribute ¤

chat_template ¤

image_classification ¤

classification ¤

generation_inputs ¤

generation_labels ¤

generation ¤

messages_generation ¤

LlavaMistralClipForClassification¤

replace_keys_in_state_dict class-attribute instance-attribute ¤

config instance-attribute ¤

vision_tower instance-attribute ¤

multi_modal_projector instance-attribute ¤

image_newline instance-attribute ¤

language_model instance-attribute ¤

dropout instance-attribute ¤

classifier instance-attribute ¤

image_token_index instance-attribute ¤

forward ¤

LlavaMistralClipForGeneration¤

config instance-attribute ¤

vision_tower instance-attribute ¤

multi_modal_projector instance-attribute ¤

image_newline instance-attribute ¤

language_model instance-attribute ¤

image_token_index instance-attribute ¤

forward ¤

generate ¤

LlavaLlamaSiglipForGeneration¤

config instance-attribute ¤

vision_tower instance-attribute ¤

multi_modal_projector instance-attribute ¤

language_model instance-attribute ¤

image_token_index instance-attribute ¤

forward ¤

generate ¤

vision_processor `instance-attribute` ¤

replace_keys_in_state_dict `class-attribute` `instance-attribute` ¤

config `instance-attribute` ¤

vision_tower `instance-attribute` ¤

multi_modal_projector `instance-attribute` ¤

image_newline `instance-attribute` ¤

language_model `instance-attribute` ¤

dropout `instance-attribute` ¤

classifier `instance-attribute` ¤

image_token_index `instance-attribute` ¤

config `instance-attribute` ¤

vision_tower `instance-attribute` ¤

multi_modal_projector `instance-attribute` ¤

image_newline `instance-attribute` ¤

language_model `instance-attribute` ¤

image_token_index `instance-attribute` ¤

config `instance-attribute` ¤

vision_tower `instance-attribute` ¤

multi_modal_projector `instance-attribute` ¤

language_model `instance-attribute` ¤

image_token_index `instance-attribute` ¤