Skip to content

unitorch.models.visualbert¤

VisualBertProcessor¤

Bases: BertProcessor

Processor for VisualBERT-based models.

Initializes the VisualBertProcessor.

Parameters:

Name Type Description Default
vocab_path str

Path to the vocabulary file.

required
max_seq_length Optional[int]

Maximum sequence length. Defaults to 128.

128
special_input_ids Optional[Dict]

Special input IDs. Defaults to an empty dictionary.

dict()
do_lower_case Optional[bool]

Whether to convert text to lowercase. Defaults to True.

True
do_basic_tokenize Optional[bool]

Whether to perform basic tokenization. Defaults to True.

True
do_whole_word_mask Optional[bool]

Whether to use whole word masking. Defaults to True.

True
masked_lm_prob Optional[float]

Probability for masked LM. Defaults to 0.15.

0.15
max_predictions_per_seq Optional[int]

Maximum number of masked LM predictions per sequence. Defaults to 20.

20
Source code in src/unitorch/models/visualbert/processing.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def __init__(
    self,
    vocab_path,
    max_seq_length: Optional[int] = 128,
    special_input_ids: Optional[Dict] = dict(),
    do_lower_case: Optional[bool] = True,
    do_basic_tokenize: Optional[bool] = True,
    do_whole_word_mask: Optional[bool] = True,
    masked_lm_prob: Optional[float] = 0.15,
    max_predictions_per_seq: Optional[int] = 20,
):
    """
    Set up the VisualBertProcessor by delegating to the parent processor.

    Args:
        vocab_path (str): Path to the vocabulary file.
        max_seq_length (Optional[int]): Maximum sequence length. Defaults to 128.
        special_input_ids (Optional[Dict]): Special input IDs. Defaults to an empty
            dictionary. Not used by this constructor (see note below).
        do_lower_case (Optional[bool]): Whether to convert text to lowercase. Defaults to True.
        do_basic_tokenize (Optional[bool]): Whether to perform basic tokenization. Defaults to True.
        do_whole_word_mask (Optional[bool]): Whether to use whole word masking. Defaults to True.
        masked_lm_prob (Optional[float]): Probability for masked LM. Defaults to 0.15.
        max_predictions_per_seq (Optional[int]): Maximum number of masked LM predictions
            per sequence. Defaults to 20.
    """
    # NOTE(review): `special_input_ids` is accepted but never handed to the
    # parent constructor, so any value passed here is dropped. Confirm against
    # the parent's signature whether it should be forwarded.
    parent_kwargs = {
        "vocab_path": vocab_path,
        "max_seq_length": max_seq_length,
        "do_lower_case": do_lower_case,
        "do_basic_tokenize": do_basic_tokenize,
        "do_whole_word_mask": do_whole_word_mask,
        "masked_lm_prob": masked_lm_prob,
        "max_predictions_per_seq": max_predictions_per_seq,
    }
    super().__init__(**parent_kwargs)

VisualBertForClassification¤

Bases: GenericModel

VisualBERT model for classification tasks.

Initialize the VisualBertForClassification model.

Parameters:

Name Type Description Default
config_path str

The path to the VisualBERT model config file.

required
num_classes int

The number of output classes for classification. Defaults to 1.

1
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/visualbert/modeling.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the VisualBERT encoder plus a dropout + linear classification head.

    Args:
        config_path (str): The path to the VisualBERT model config file (JSON).
        num_classes (int, optional): Output width of the linear classifier. Defaults to 1.
        gradient_checkpointing (bool, optional): Value stored on the loaded config's
            `gradient_checkpointing` attribute. Defaults to False.
    """
    super().__init__()

    self.config = VisualBertConfig.from_json_file(config_path)
    cfg = self.config
    # The flag is written onto the config before the encoder is constructed
    # from that same config.
    cfg.gradient_checkpointing = gradient_checkpointing

    # Encoder first, then the head: dropout followed by a projection from
    # hidden_size to num_classes.
    self.visual_bert = VisualBertModel(cfg)
    self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
    self.classifier = nn.Linear(cfg.hidden_size, num_classes)

    self.init_weights()

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
    visual_embeds: Tensor,
    visual_attention_mask: Tensor,
    visual_token_type_ids: Tensor,
)

Forward pass of the VisualBertForClassification model.

Parameters:

Name Type Description Default
input_ids Tensor

The input token IDs.

required
attention_mask Tensor

The attention mask.

required
token_type_ids Tensor

The token type IDs.

required
position_ids Tensor

The position IDs.

required
visual_embeds Tensor

The visual embeddings.

required
visual_attention_mask Tensor

The visual attention mask.

required
visual_token_type_ids Tensor

The visual token type IDs.

required

Returns:

Type Description
Tensor

The logits for classification.

Source code in src/unitorch/models/visualbert/modeling.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
    visual_embeds: torch.Tensor,
    visual_attention_mask: torch.Tensor,
    visual_token_type_ids: torch.Tensor,
):
    """
    Run the encoder and classify from its second output.

    Args:
        input_ids (torch.Tensor): The input token IDs.
        attention_mask (torch.Tensor): The attention mask.
        token_type_ids (torch.Tensor): The token type IDs.
        position_ids (torch.Tensor): The position IDs.
        visual_embeds (torch.Tensor): The visual embeddings.
        visual_attention_mask (torch.Tensor): The visual attention mask.
        visual_token_type_ids (torch.Tensor): The visual token type IDs.

    Returns:
        (torch.Tensor): The logits for classification.
    """
    # input_ids goes positionally; everything else is passed by keyword.
    encoder_kwargs = {
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "position_ids": position_ids,
        "visual_embeds": visual_embeds,
        "visual_attention_mask": visual_attention_mask,
        "visual_token_type_ids": visual_token_type_ids,
    }
    encoder_outputs = self.visual_bert(input_ids, **encoder_kwargs)

    # Index 1 of the encoder output is used as the pooled representation.
    pooled = encoder_outputs[1]
    return self.classifier(self.dropout(pooled))

VisualBertForPretrain¤

Bases: GenericModel

VisualBERT model for pretraining tasks.

Initialize the VisualBertForPretrain model.

Parameters:

Name Type Description Default
config_path str

The path to the VisualBERT model config file.

required
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/visualbert/modeling.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def __init__(
    self,
    config_path: str,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build the VisualBERT encoder, the pretraining heads and both loss functions.

    Args:
        config_path (str): The path to the VisualBERT model config file (JSON).
        gradient_checkpointing (bool, optional): Value stored on the loaded config's
            `gradient_checkpointing` attribute. Defaults to False.
    """
    super().__init__()

    self.config = VisualBertConfig.from_json_file(config_path)
    cfg = self.config
    # The flag is written onto the config before the encoder is constructed
    # from that same config.
    cfg.gradient_checkpointing = gradient_checkpointing

    self.visual_bert = VisualBertModel(cfg)
    self.cls = VisualBertPreTrainingHeads(cfg)
    self.init_weights()

    # Both criteria keep per-element losses (reduction="none"); forward()
    # performs the masking and averaging itself.
    unreduced = "none"
    self.mlm_loss_fn = nn.CrossEntropyLoss(reduction=unreduced)
    self.nsp_loss_fn = nn.CrossEntropyLoss(reduction=unreduced)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Tensor,
    token_type_ids: Tensor,
    position_ids: Tensor,
    visual_embeds: Tensor,
    visual_attention_mask: Tensor,
    visual_token_type_ids: Tensor,
    nsp_label: Tensor,
    mlm_label: Tensor,
    mlm_label_mask: Tensor,
)

Forward pass of the VisualBertForPretrain model.

Parameters:

Name Type Description Default
input_ids Tensor

The input token IDs.

required
attention_mask Tensor

The attention mask.

required
token_type_ids Tensor

The token type IDs.

required
position_ids Tensor

The position IDs.

required
visual_embeds Tensor

The visual embeddings.

required
visual_attention_mask Tensor

The visual attention mask.

required
visual_token_type_ids Tensor

The visual token type IDs.

required
nsp_label Tensor

The next sentence prediction labels.

required
mlm_label Tensor

The masked language modeling labels.

required
mlm_label_mask Tensor

Mask multiplied into the per-token masked language modeling loss; positions with a zero mask do not contribute to the loss.

required

Returns:

Type Description
Tensor

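The scalar pretraining loss: the batch-mean masked language modeling loss plus the batch-mean next sentence prediction loss.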

Source code in src/unitorch/models/visualbert/modeling.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    token_type_ids: torch.Tensor,
    position_ids: torch.Tensor,
    visual_embeds: torch.Tensor,
    visual_attention_mask: torch.Tensor,
    visual_token_type_ids: torch.Tensor,
    nsp_label: torch.Tensor,
    mlm_label: torch.Tensor,
    mlm_label_mask: torch.Tensor,
):
    """
    Forward pass of the VisualBertForPretrain model.

    Computes the masked-LM loss (averaged over masked positions within each
    sequence, then over the batch) and adds the batch-mean next sentence
    prediction loss.

    Args:
        input_ids (torch.Tensor): The input token IDs.
        attention_mask (torch.Tensor): The attention mask.
        token_type_ids (torch.Tensor): The token type IDs.
        position_ids (torch.Tensor): The position IDs.
        visual_embeds (torch.Tensor): The visual embeddings.
        visual_attention_mask (torch.Tensor): The visual attention mask.
        visual_token_type_ids (torch.Tensor): The visual token type IDs.
        nsp_label (torch.Tensor): The next sentence prediction labels.
        mlm_label (torch.Tensor): The masked language modeling labels. Must hold
            one label per position of the prediction scores.
        mlm_label_mask (torch.Tensor): Mask multiplied into the per-token MLM loss.
            Must be viewable as (batch_size, seq_len) of the prediction scores.

    Returns:
        (torch.Tensor): The scalar loss (MLM loss + NSP loss).
    """
    encoder_outputs = self.visual_bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        visual_embeds=visual_embeds,
        visual_attention_mask=visual_attention_mask,
        visual_token_type_ids=visual_token_type_ids,
    )
    sequence_output, pooled_output = encoder_outputs[:2]
    prediction_scores, seq_relationship_score = self.cls(
        sequence_output, pooled_output
    )

    batch_size, seq_len, vocab_size = prediction_scores.size()
    token_mask = mlm_label_mask.view(batch_size, seq_len)

    # Per-token cross entropy (the criterion is unreduced), zeroed wherever
    # the mask is zero.
    per_token_loss = self.mlm_loss_fn(
        prediction_scores.view(-1, vocab_size), mlm_label.view(-1)
    )
    per_token_loss = per_token_loss.view(batch_size, seq_len) * token_mask

    # Divide by the number of masked positions per sequence, floored at 1 so
    # a sequence with no masked positions contributes 0 instead of 0/0.
    # The floor tensor is created directly on the mask's device rather than
    # on the CPU followed by a host-to-device copy on every call.
    masked_counts = torch.max(
        token_mask.sum(1),
        torch.ones(batch_size, device=mlm_label_mask.device),
    )
    masked_lm_loss = (per_token_loss.sum(1) / masked_counts).mean()

    nsp_loss = self.nsp_loss_fn(
        seq_relationship_score.view(-1, 2), nsp_label.view(-1)
    ).mean()

    # Out-of-place sum: same value as the in-place add, without in-place
    # dtype-cast restrictions.
    return masked_lm_loss + nsp_loss

get_output_embeddings ¤

get_output_embeddings()

Get the output embeddings of the model.

Returns:

Type Description
Module

The output embeddings.

Source code in src/unitorch/models/visualbert/modeling.py
110
111
112
113
114
115
116
117
def get_output_embeddings(self):
    """
    Return the decoder of the pretraining prediction head.

    Returns:
        (nn.Module): The object stored at `self.cls.predictions.decoder`.
    """
    prediction_head = self.cls.predictions
    return prediction_head.decoder

set_output_embeddings ¤

set_output_embeddings(new_embeddings)

Set the output embeddings of the model.

Parameters:

Name Type Description Default
new_embeddings Module

The new output embeddings.

required
Source code in src/unitorch/models/visualbert/modeling.py
119
120
121
122
123
124
125
126
def set_output_embeddings(self, new_embeddings):
    """
    Replace the decoder of the pretraining prediction head.

    Args:
        new_embeddings (nn.Module): The object to store at
            `self.cls.predictions.decoder`.
    """
    prediction_head = self.cls.predictions
    prediction_head.decoder = new_embeddings