
unitorch.models.bert¤

BertProcessor¤

Bases: HfTextClassificationProcessor

Initializes the BertProcessor.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| vocab_path | str | The path to the vocabulary file. | required |
| max_seq_length | Optional[int] | The maximum sequence length. | 128 |
| special_input_ids | Optional[Dict] | Special input IDs mapping. | dict() |
| do_lower_case | Optional[bool] | Whether to perform lowercase tokenization. | True |
| do_basic_tokenize | Optional[bool] | Whether to perform basic tokenization. | True |
| do_whole_word_mask | Optional[bool] | Whether to perform whole word masking. | True |
| masked_lm_prob | Optional[float] | The probability of masking a token for pretraining. | 0.15 |
| max_predictions_per_seq | Optional[int] | The maximum number of masked tokens per sequence for pretraining. | 20 |
Source code in src/unitorch/models/bert/processing.py
def __init__(
    self,
    vocab_path: str,
    max_seq_length: Optional[int] = 128,
    special_input_ids: Optional[Dict] = dict(),
    do_lower_case: Optional[bool] = True,
    do_basic_tokenize: Optional[bool] = True,
    do_whole_word_mask: Optional[bool] = True,
    masked_lm_prob: Optional[float] = 0.15,
    max_predictions_per_seq: Optional[int] = 20,
):
    """
    Initializes the BertProcessor.

    Args:
        vocab_path (str): The path to the vocabulary file.
        max_seq_length (Optional[int], optional): The maximum sequence length. Defaults to 128.
        special_input_ids (Optional[Dict], optional): Special input IDs mapping. Defaults to an empty dictionary.
        do_lower_case (Optional[bool], optional): Whether to perform lowercase tokenization. Defaults to True.
        do_basic_tokenize (Optional[bool], optional): Whether to perform basic tokenization. Defaults to True.
        do_whole_word_mask (Optional[bool], optional): Whether to perform whole word masking. Defaults to True.
        masked_lm_prob (Optional[float], optional): The probability of masking a token for pretraining. Defaults to 0.15.
        max_predictions_per_seq (Optional[int], optional): The maximum number of masked tokens per sequence for pretraining. Defaults to 20.
    """
    tokenizer = get_bert_tokenizer(
        vocab_path,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        special_input_ids=special_input_ids,
    )
    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
    )
    self.do_whole_word_mask = do_whole_word_mask
    self.masked_lm_prob = masked_lm_prob
    self.max_predictions_per_seq = max_predictions_per_seq
    self.vocab_words = list(self.tokenizer.vocab.keys())
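
A minimal construction sketch (not part of the library source): it assumes BertProcessor is importable from unitorch.models.bert as documented on this page, and the vocabulary path is a hypothetical placeholder for a BERT WordPiece vocab.txt.

from unitorch.models.bert import BertProcessor

processor = BertProcessor(
    vocab_path="/path/to/vocab.txt",  # hypothetical path to a WordPiece vocab file
    max_seq_length=128,
)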

pretrain ¤

pretrain(
    text: str,
    text_pair: str,
    nsp_label: int,
    max_seq_length: Optional[int] = None,
    masked_lm_prob: Optional[float] = None,
    do_whole_word_mask: Optional[bool] = None,
    max_predictions_per_seq: Optional[int] = None,
)

Applies the BERT pretraining processor to the given text and text pair.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The input text. | required |
| text_pair | str | The input text pair. | required |
| nsp_label | int | The next sentence prediction label. | required |
| max_seq_length | Optional[int] | The maximum sequence length. | None |
| masked_lm_prob | Optional[float] | The probability of masking a token for pretraining. | None |
| do_whole_word_mask | Optional[bool] | Whether to perform whole word masking. | None |
| max_predictions_per_seq | Optional[int] | The maximum number of masked tokens per sequence for pretraining. | None |

Returns:

| Type | Description |
| --- | --- |
| GenericOutputs | Pretrain processing outputs. |

Source code in src/unitorch/models/bert/processing.py
def pretrain(
    self,
    text: str,
    text_pair: str,
    nsp_label: int,
    max_seq_length: Optional[int] = None,
    masked_lm_prob: Optional[float] = None,
    do_whole_word_mask: Optional[bool] = None,
    max_predictions_per_seq: Optional[int] = None,
):
    """
    Applies the BERT pretraining processor to the given text and text pair.

    Args:
        text (str): The input text.
        text_pair (str): The input text pair.
        nsp_label (int): The next sentence prediction label.
        max_seq_length (Optional[int], optional): The maximum sequence length. Defaults to None.
        masked_lm_prob (Optional[float], optional): The probability of masking a token for pretraining. Defaults to None.
        do_whole_word_mask (Optional[bool], optional): Whether to perform whole word masking. Defaults to None.
        max_predictions_per_seq (Optional[int], optional): The maximum number of masked tokens per sequence for pretraining. Defaults to None.

    Returns:
        GenericOutputs: pretrain processing outputs.
    """
    max_seq_length = pop_value(
        max_seq_length,
        self.max_seq_length,
    )

    masked_lm_prob = pop_value(
        masked_lm_prob,
        self.masked_lm_prob,
    )

    do_whole_word_mask = pop_value(
        do_whole_word_mask,
        self.do_whole_word_mask,
    )

    max_predictions_per_seq = pop_value(
        max_predictions_per_seq,
        self.max_predictions_per_seq,
    )

    _tokens = self.tokenizer.tokenize(str(text))
    tokens_pair = self.tokenizer.tokenize(str(text_pair))
    truncate_sequence_pair(_tokens, tokens_pair, max_seq_length - 3)
    tokens = (
        [self.cls_token]
        + _tokens
        + [self.sep_token]
        + tokens_pair
        + [self.sep_token]
    )

    covered_indexes = get_random_mask_indexes(
        tokens,
        masked_lm_prob,
        do_whole_word_mask,
        max_predictions_per_seq,
        special_tokens=[self.cls_token, self.sep_token],
    )
    label = [
        tokens[pos] if pos in covered_indexes else self.pad_token
        for pos in range(max_seq_length)
    ]
    label_mask = [
        1 if pos in covered_indexes else 0 for pos in range(max_seq_length)
    ]
    label = self.tokenizer.convert_tokens_to_ids(label)

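    # BERT-style 80/10/10 corruption: 80% of the selected positions become the
    # mask token, ~10% keep the original token, and ~10% get a random vocab word.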
    for index in covered_indexes:
        mask_token = None
        if random.random() < 0.8:
            mask_token = self.mask_token
        else:
            mask_token = (
                tokens[index]
                if random.random() < 0.5
                else get_random_word(self.vocab_words)
            )
        tokens[index] = mask_token

    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
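    # Segment 0 covers [CLS] + text + [SEP]; segment 1 covers text_pair + [SEP].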
    token_type_ids = [0] + [0] * len(_tokens) + [0] + [1] * len(tokens_pair) + [1]
    attention_mask = [1] * len(input_ids)

    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += len(padding) * [self.pad_token_id]
    attention_mask += padding
    token_type_ids += len(padding) * [1]

    assert len(input_ids) == max_seq_length
    assert len(attention_mask) == max_seq_length
    assert len(token_type_ids) == max_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        token_type_ids=torch.tensor(token_type_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        position_ids=torch.tensor(list(range(max_seq_length)), dtype=torch.long),
        nsp_label=torch.tensor(int(nsp_label), dtype=torch.long),
        mlm_label=torch.tensor(label, dtype=torch.long),
        mlm_label_mask=torch.tensor(label_mask, dtype=torch.long),
    )
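
Building on the sketch above, a hedged example of calling pretrain. The sentences and nsp_label value are illustrative only (check your data pipeline's NSP labeling convention), and the shapes assume the default max_seq_length of 128.

outputs = processor.pretrain(
    text="The cat sat on the mat.",
    text_pair="Stock prices rose sharply on Monday.",
    nsp_label=1,  # illustrative label; 0/1 semantics depend on your NSP convention
)
print(outputs.input_ids.shape)            # torch.Size([128])
print(outputs.mlm_label.shape)            # torch.Size([128])
print(int(outputs.mlm_label_mask.sum()))  # number of masked positions (at most 20 by default)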

BertForClassification¤

Bases: GenericModel

Initializes the BertForClassification model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| config_path | str | The path to the configuration file. | required |
| num_classes | Optional[int] | The number of classes for classification. | 1 |
| gradient_checkpointing | Optional[bool] | Whether to use gradient checkpointing. | False |
Source code in src/unitorch/models/bert/modeling.py
def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Initializes the BertForClassification model.

    Args:
        config_path (str): The path to the configuration file.
        num_classes (Optional[int], optional): The number of classes for classification. Defaults to 1.
        gradient_checkpointing (Optional[bool], optional): Whether to use gradient checkpointing. Defaults to False.
    """
    super().__init__()
    self.config = BertConfig.from_json_file(config_path)
    self.config.gradient_checkpointing = gradient_checkpointing
    self.bert = BertModel(self.config)
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, num_classes)
    self.init_weights()
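
A minimal construction sketch (not part of the library source): it assumes BertForClassification is importable from unitorch.models.bert, and the config path is a hypothetical placeholder for a standard BERT config.json.

from unitorch.models.bert import BertForClassification

model = BertForClassification(
    config_path="/path/to/config.json",  # hypothetical path to a BERT config file
    num_classes=2,
)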

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    token_type_ids: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
)

Forward pass of the BertForClassification model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_ids | Tensor | The input tensor of token indices. | required |
| attention_mask | Optional[Tensor] | The attention mask tensor. | None |
| token_type_ids | Optional[Tensor] | The token type IDs tensor. | None |
| position_ids | Optional[Tensor] | The position IDs tensor. | None |

Returns:

| Type | Description |
| --- | --- |
| Tensor | The logits of the model output. |

Source code in src/unitorch/models/bert/modeling.py
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
):
    """
    Forward pass of the BertForClassification model.

    Args:
        input_ids (torch.Tensor): The input tensor of token indices.
        attention_mask (torch.Tensor, optional): The attention mask tensor. Defaults to None.
        token_type_ids (torch.Tensor, optional): The token type IDs tensor. Defaults to None.
        position_ids (torch.Tensor, optional): The position IDs tensor. Defaults to None.

    Returns:
        (torch.Tensor): The logits of the model output.
    """
    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )
    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    return logits
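
A hedged end-to-end sketch, purely to illustrate tensor shapes: the tensors come from the pretrain example above and gain a batch dimension via unsqueeze(0), and the output shape assumes the model was built with num_classes=2.

import torch

model.eval()
with torch.no_grad():
    logits = model(
        input_ids=outputs.input_ids.unsqueeze(0),
        attention_mask=outputs.attention_mask.unsqueeze(0),
        token_type_ids=outputs.token_type_ids.unsqueeze(0),
        position_ids=outputs.position_ids.unsqueeze(0),
    )
print(logits.shape)  # torch.Size([1, 2])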