unitorch.models.visualbert¤

VisualBertProcessor¤

Bases: BertProcessor

Processor for VisualBERT-based models.

Initializes the VisualBertProcessor.

Parameters:

Name	Type	Description	Default
`vocab_path`	`str`	Path to the vocabulary file.	required
`max_seq_length`	`Optional[int]`	Maximum sequence length. Defaults to 128.	`128`
`special_input_ids`	`Optional[Dict]`	Special input IDs. Defaults to an empty dictionary. Currently accepted but not forwarded to the parent `BertProcessor` initializer.	`dict()`
`do_lower_case`	`Optional[bool]`	Whether to convert text to lowercase. Defaults to True.	`True`
`do_basic_tokenize`	`Optional[bool]`	Whether to perform basic tokenization. Defaults to True.	`True`
`do_whole_word_mask`	`Optional[bool]`	Whether to use whole word masking. Defaults to True.	`True`
`masked_lm_prob`	`Optional[float]`	Probability for masked LM. Defaults to 0.15.	`0.15`
`max_predictions_per_seq`	`Optional[int]`	Maximum number of masked LM predictions per sequence. Defaults to 20.	`20`

Source code in src/unitorch/models/visualbert/processing.py

def __init__(
    self,
    vocab_path,
    max_seq_length: Optional[int] = 128,
    special_input_ids: Optional[Dict] = dict(),
    do_lower_case: Optional[bool] = True,
    do_basic_tokenize: Optional[bool] = True,
    do_whole_word_mask: Optional[bool] = True,
    masked_lm_prob: Optional[float] = 0.15,
    max_predictions_per_seq: Optional[int] = 20,
):
    """
    Set up the VisualBertProcessor by delegating to the parent initializer.

    Args:
        vocab_path (str): Path to the vocabulary file.
        max_seq_length (Optional[int]): Maximum sequence length. Defaults to 128.
        special_input_ids (Optional[Dict]): Special input IDs. Defaults to an empty dictionary.
            This value is accepted here but is not handed to the parent initializer.
        do_lower_case (Optional[bool]): Whether to convert text to lowercase. Defaults to True.
        do_basic_tokenize (Optional[bool]): Whether to perform basic tokenization. Defaults to True.
        do_whole_word_mask (Optional[bool]): Whether to use whole word masking. Defaults to True.
        masked_lm_prob (Optional[float]): Probability for masked LM. Defaults to 0.15.
        max_predictions_per_seq (Optional[int]): Maximum number of masked LM predictions per sequence. Defaults to 20.
    """
    # NOTE(review): `special_input_ids` is not part of the forwarded options,
    # matching the existing behavior; confirm whether the parent should get it.
    parent_options = dict(
        vocab_path=vocab_path,
        max_seq_length=max_seq_length,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        do_whole_word_mask=do_whole_word_mask,
        masked_lm_prob=masked_lm_prob,
        max_predictions_per_seq=max_predictions_per_seq,
    )
    super().__init__(**parent_options)

VisualBertForClassification¤

Bases: GenericModel

VisualBERT model for classification tasks.

Initialize the VisualBertForClassification model.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	The path to the VisualBERT model config file.	required
`num_classes`	`int`	The number of output classes for classification. Defaults to 1.	`1`
`gradient_checkpointing`	`bool`	Whether to use gradient checkpointing. Defaults to False.	`False`

Source code in src/unitorch/models/visualbert/modeling.py

def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the VisualBERT encoder plus a dropout + linear classification head.

    Args:
        config_path (str): The path to the VisualBERT model config file (loaded as JSON).
        num_classes (int, optional): Output width of the linear classifier. Defaults to 1.
        gradient_checkpointing (bool, optional): Value stored on the loaded config's
            `gradient_checkpointing` attribute. Defaults to False.
    """
    super().__init__()

    # Load the config first, then record the checkpointing flag on it before
    # the encoder is constructed from that same config object.
    self.config = VisualBertConfig.from_json_file(config_path)
    config = self.config
    config.gradient_checkpointing = gradient_checkpointing

    # Sub-modules are created in the same order as before: encoder, dropout, classifier.
    self.visual_bert = VisualBertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, num_classes)

    self.init_weights()

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
    visual_embeds: Tensor,
    visual_attention_mask: Tensor,
    visual_token_type_ids: Tensor,
)

Forward pass of the VisualBertForClassification model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	The input token IDs.	required
`attention_mask`	`Tensor`	The attention mask.	required
`token_type_ids`	`Tensor`	The token type IDs.	required
`position_ids`	`Tensor`	The position IDs.	required
`visual_embeds`	`Tensor`	The visual embeddings.	required
`visual_attention_mask`	`Tensor`	The visual attention mask.	required
`visual_token_type_ids`	`Tensor`	The visual token type IDs.	required

Returns:

Type	Description
`Tensor`	The logits for classification.

Source code in src/unitorch/models/visualbert/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
    visual_embeds: torch.Tensor,
    visual_attention_mask: torch.Tensor,
    visual_token_type_ids: torch.Tensor,
):
    """
    Run the VisualBERT encoder and classify its pooled output.

    Args:
        input_ids (torch.Tensor): The input token IDs.
        attention_mask (torch.Tensor): The attention mask.
        token_type_ids (torch.Tensor): The token type IDs.
        position_ids (torch.Tensor): The position IDs.
        visual_embeds (torch.Tensor): The visual embeddings.
        visual_attention_mask (torch.Tensor): The visual attention mask.
        visual_token_type_ids (torch.Tensor): The visual token type IDs.

    Returns:
        (torch.Tensor): The logits for classification.
    """
    encoder_outputs = self.visual_bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        visual_embeds=visual_embeds,
        visual_attention_mask=visual_attention_mask,
        visual_token_type_ids=visual_token_type_ids,
    )

    # The second encoder output is used as the pooled representation.
    pooled = self.dropout(encoder_outputs[1])
    return self.classifier(pooled)

VisualBertForPretrain¤

Bases: GenericModel

VisualBERT model for pretraining tasks.

Initialize the VisualBertForPretrain model.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	The path to the VisualBERT model config file.	required
`gradient_checkpointing`	`bool`	Whether to use gradient checkpointing. Defaults to False.	`False`

Source code in src/unitorch/models/visualbert/modeling.py

def __init__(
    self,
    config_path: str,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the VisualBERT encoder, its pretraining heads and the two loss functions.

    Args:
        config_path (str): The path to the VisualBERT model config file (loaded as JSON).
        gradient_checkpointing (bool, optional): Value stored on the loaded config's
            `gradient_checkpointing` attribute. Defaults to False.
    """
    super().__init__()

    # Load the config and record the checkpointing flag before building modules from it.
    self.config = VisualBertConfig.from_json_file(config_path)
    config = self.config
    config.gradient_checkpointing = gradient_checkpointing

    self.visual_bert = VisualBertModel(config)
    self.cls = VisualBertPreTrainingHeads(config)
    self.init_weights()

    # Both criteria return un-reduced losses; reduction happens in `forward`.
    unreduced = "none"
    self.mlm_loss_fn = nn.CrossEntropyLoss(reduction=unreduced)
    self.nsp_loss_fn = nn.CrossEntropyLoss(reduction=unreduced)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
    visual_embeds: Tensor,
    visual_attention_mask: Tensor,
    visual_token_type_ids: Tensor,
    nsp_label: Tensor,
    mlm_label: Tensor,
    mlm_label_mask: Tensor,
)

Forward pass of the VisualBertForPretrain model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	The input token IDs.	required
`attention_mask`	`Tensor`	The attention mask.	required
`token_type_ids`	`Tensor`	The token type IDs.	required
`position_ids`	`Tensor`	The position IDs.	required
`visual_embeds`	`Tensor`	The visual embeddings.	required
`visual_attention_mask`	`Tensor`	The visual attention mask.	required
`visual_token_type_ids`	`Tensor`	The visual token type IDs.	required
`nsp_label`	`Tensor`	The next sentence prediction labels.	required
`mlm_label`	`Tensor`	The masked language modeling labels.	required
`mlm_label_mask`	`Tensor`	The masked language modeling label mask.	required

Returns:

Type	Description
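`Tensor`	The scalar pretraining loss: the masked-LM loss (averaged over masked positions per sample, then over the batch) plus the mean next-sentence-prediction loss.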

Source code in src/unitorch/models/visualbert/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
    visual_embeds: torch.Tensor,
    visual_attention_mask: torch.Tensor,
    visual_token_type_ids: torch.Tensor,
    nsp_label: torch.Tensor,
    mlm_label: torch.Tensor,
    mlm_label_mask: torch.Tensor,
):
    """
    Compute the combined pretraining loss (masked LM + next sentence prediction).

    Args:
        input_ids (torch.Tensor): The input token IDs.
        attention_mask (torch.Tensor): The attention mask.
        token_type_ids (torch.Tensor): The token type IDs.
        position_ids (torch.Tensor): The position IDs.
        visual_embeds (torch.Tensor): The visual embeddings.
        visual_attention_mask (torch.Tensor): The visual attention mask.
        visual_token_type_ids (torch.Tensor): The visual token type IDs.
        nsp_label (torch.Tensor): The next sentence prediction labels.
        mlm_label (torch.Tensor): The masked language modeling labels.
        mlm_label_mask (torch.Tensor): The masked language modeling label mask;
            it multiplies the per-token MLM loss and its per-sample sum is the
            averaging denominator.

    Returns:
        (torch.Tensor): The mean masked-LM loss plus the mean NSP loss.
    """
    encoder_outputs = self.visual_bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        visual_embeds=visual_embeds,
        visual_attention_mask=visual_attention_mask,
        visual_token_type_ids=visual_token_type_ids,
    )
    token_states, pooled_state = encoder_outputs[:2]
    mlm_logits, nsp_logits = self.cls(token_states, pooled_state)

    batch_size, seq_len, vocab_size = mlm_logits.size()

    # Per-token MLM loss, weighted by the label mask.
    token_loss = self.mlm_loss_fn(
        mlm_logits.view(-1, vocab_size), mlm_label.view(-1)
    )
    token_loss = token_loss * mlm_label_mask.view(-1)

    # Per-sample average over the mask; the denominator is floored at 1 so a
    # sample whose mask sums to 0 contributes 0 instead of dividing by zero.
    per_sample_sum = token_loss.view(batch_size, seq_len).sum(1)
    mask_totals = mlm_label_mask.view(batch_size, seq_len).sum(1)
    unit_floor = torch.ones(batch_size).to(mlm_label_mask.device)
    per_sample_mlm = per_sample_sum / torch.max(mask_totals, unit_floor)

    loss = per_sample_mlm.mean()

    nsp_loss = self.nsp_loss_fn(nsp_logits.view(-1, 2), nsp_label.view(-1)).mean()
    loss += nsp_loss

    return loss

get_output_embeddings ¤

get_output_embeddings()

Get the output embeddings of the model.

Returns:

Type	Description
`Module`	The output embeddings.

Source code in src/unitorch/models/visualbert/modeling.py

def get_output_embeddings(self):
    """
    Return the decoder module of the pretraining prediction head.

    Returns:
        (nn.Module): The object stored at `self.cls.predictions.decoder`.
    """
    prediction_head = self.cls.predictions
    return prediction_head.decoder

set_output_embeddings ¤

set_output_embeddings(new_embeddings)

Set the output embeddings of the model.

Parameters:

Name	Type	Description	Default
`new_embeddings`	`Module`	The new output embeddings.	required

Source code in src/unitorch/models/visualbert/modeling.py

def set_output_embeddings(self, new_embeddings):
    """
    Replace the decoder module of the pretraining prediction head.

    Args:
        new_embeddings (nn.Module): The object to store at `self.cls.predictions.decoder`.
    """
    prediction_head = self.cls.predictions
    prediction_head.decoder = new_embeddings