unitorch.models.bert¤

BertProcessor¤

Bases: HfTextClassificationProcessor

Text processor for BERT models, including MLM pre-training support.

Source code in src/unitorch/models/bert/processing.py

def __init__(
    self,
    vocab_path: str,
    max_seq_length: int = 128,
    special_input_ids: Optional[Dict] = None,
    do_lower_case: bool = True,
    do_basic_tokenize: bool = True,
    do_whole_word_mask: bool = True,
    masked_lm_prob: float = 0.15,
    max_predictions_per_seq: int = 20,
) -> None:
    """Build the BERT tokenizer and store the default masking settings.

    Args:
        vocab_path: Vocabulary file handed to ``get_bert_tokenizer``.
        max_seq_length: Default sequence length, forwarded to the parent
            processor.
        special_input_ids: Optional mapping forwarded unchanged to
            ``get_bert_tokenizer``.
        do_lower_case: Forwarded to ``get_bert_tokenizer``.
        do_basic_tokenize: Forwarded to ``get_bert_tokenizer``.
        do_whole_word_mask: Default whole-word-masking flag used by
            ``pretrain`` when the caller does not override it.
        masked_lm_prob: Default masking probability used by ``pretrain``.
        max_predictions_per_seq: Default cap on masked positions used by
            ``pretrain``.
    """
    bert_tokenizer = get_bert_tokenizer(
        vocab_path,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        special_input_ids=special_input_ids,
    )
    super().__init__(tokenizer=bert_tokenizer, max_seq_length=max_seq_length)

    # Per-instance defaults; each one can be overridden per call in pretrain().
    self.do_whole_word_mask = do_whole_word_mask
    self.masked_lm_prob = masked_lm_prob
    self.max_predictions_per_seq = max_predictions_per_seq

    # Materialised once so random replacement words can be drawn from a list.
    self.vocab_words = list(self.tokenizer.vocab.keys())

do_whole_word_mask `instance-attribute` ¤

do_whole_word_mask = do_whole_word_mask

masked_lm_prob `instance-attribute` ¤

masked_lm_prob = masked_lm_prob

max_predictions_per_seq `instance-attribute` ¤

max_predictions_per_seq = max_predictions_per_seq

vocab_words `instance-attribute` ¤

vocab_words = list(keys())

pretrain ¤

pretrain(
    text: str,
    text_pair: str,
    nsp_label: int,
    max_seq_length: Optional[int] = None,
    masked_lm_prob: Optional[float] = None,
    do_whole_word_mask: Optional[bool] = None,
    max_predictions_per_seq: Optional[int] = None,
) -> GenericOutputs

Process a sentence pair for BERT pre-training (MLM + NSP).

Source code in src/unitorch/models/bert/processing.py

def pretrain(
    self,
    text: str,
    text_pair: str,
    nsp_label: int,
    max_seq_length: Optional[int] = None,
    masked_lm_prob: Optional[float] = None,
    do_whole_word_mask: Optional[bool] = None,
    max_predictions_per_seq: Optional[int] = None,
) -> GenericOutputs:
    """Process a sentence pair for BERT pre-training (MLM + NSP).

    Args:
        text: First segment; converted with ``str()`` before tokenizing.
        text_pair: Second segment; converted with ``str()`` before tokenizing.
        nsp_label: Next-sentence-prediction label; converted with ``int()``.
        max_seq_length: Total output length. Resolved against the instance
            default via ``pop_value``.
        masked_lm_prob: Masking probability passed to
            ``get_random_mask_indexes``. Resolved via ``pop_value``.
        do_whole_word_mask: Whole-word-masking flag passed to
            ``get_random_mask_indexes``. Resolved via ``pop_value``.
        max_predictions_per_seq: Cap passed to ``get_random_mask_indexes``.
            Resolved via ``pop_value``.

    Returns:
        GenericOutputs holding ``input_ids``, ``token_type_ids``,
        ``attention_mask``, ``position_ids``, ``nsp_label``, ``mlm_label``
        and ``mlm_label_mask`` as ``torch.long`` tensors. All but the scalar
        ``nsp_label`` have length ``max_seq_length``.
    """
    max_seq_length = pop_value(max_seq_length, self.max_seq_length)
    masked_lm_prob = pop_value(masked_lm_prob, self.masked_lm_prob)
    do_whole_word_mask = pop_value(do_whole_word_mask, self.do_whole_word_mask)
    max_predictions_per_seq = pop_value(
        max_predictions_per_seq, self.max_predictions_per_seq
    )

    tokens_a = self.tokenizer.tokenize(str(text))
    tokens_b = self.tokenizer.tokenize(str(text_pair))
    # Three slots are reserved for one CLS and two SEP tokens.
    truncate_sequence_pair(tokens_a, tokens_b, max_seq_length - 3)
    tokens = [self.cls_token]
    tokens += tokens_a
    tokens.append(self.sep_token)
    tokens += tokens_b
    tokens.append(self.sep_token)

    covered_indexes = get_random_mask_indexes(
        tokens,
        masked_lm_prob=masked_lm_prob,
        do_whole_word_mask=do_whole_word_mask,
        max_predictions_per_seq=max_predictions_per_seq,
        special_tokens=[self.cls_token, self.sep_token],
    )

    # Labels are captured before `tokens` is corrupted below: the original
    # token at each selected position, the pad token everywhere else.
    label_tokens = []
    mlm_label_mask = []
    for position in range(max_seq_length):
        selected = position in covered_indexes
        label_tokens.append(tokens[position] if selected else self.pad_token)
        mlm_label_mask.append(1 if selected else 0)
    mlm_label = self.tokenizer.convert_tokens_to_ids(label_tokens)

    # First draw: 80% become the mask token. Of the remaining 20%, a second
    # draw keeps the original half the time and swaps in a random word
    # otherwise.
    for idx in covered_indexes:
        if random.random() < 0.8:
            tokens[idx] = self.mask_token
            continue
        if random.random() < 0.5:
            continue  # keep original token
        tokens[idx] = get_random_word(self.vocab_words)

    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    first_segment_len = len(tokens_a) + 2  # CLS + tokens_a + SEP
    second_segment_len = len(tokens_b) + 1  # tokens_b + SEP
    token_type_ids = [0] * first_segment_len + [1] * second_segment_len
    attention_mask = [1] * len(input_ids)

    pad_len = max_seq_length - len(input_ids)
    input_ids.extend([self.pad_token_id] * pad_len)
    attention_mask.extend([0] * pad_len)
    # NOTE(review): padding positions get segment id 1 (not 0); attention_mask
    # is 0 at those positions. Confirm this is intended.
    token_type_ids.extend([1] * pad_len)

    for padded in (input_ids, attention_mask, token_type_ids):
        assert len(padded) == max_seq_length

    def as_long(values):
        return torch.tensor(values, dtype=torch.long)

    return GenericOutputs(
        input_ids=as_long(input_ids),
        token_type_ids=as_long(token_type_ids),
        attention_mask=as_long(attention_mask),
        position_ids=torch.arange(max_seq_length, dtype=torch.long),
        nsp_label=as_long(int(nsp_label)),
        mlm_label=as_long(mlm_label),
        mlm_label_mask=as_long(mlm_label_mask),
    )

BertForClassification¤

Bases: GenericModel

BERT model for sequence classification.

Source code in src/unitorch/models/bert/modeling.py

def __init__(
    self,
    config_path: str,
    num_classes: int = 1,
    gradient_checkpointing: bool = False,
) -> None:
    """Build a BERT encoder with a linear classification head.

    Args:
        config_path: JSON file loaded with ``BertConfig.from_json_file``.
        num_classes: Output width of the classifier layer.
        gradient_checkpointing: Value written onto the loaded config's
            ``gradient_checkpointing`` attribute before the encoder is built.
    """
    super().__init__()

    self.config = BertConfig.from_json_file(config_path)
    cfg = self.config
    cfg.gradient_checkpointing = gradient_checkpointing

    # Encoder first, head second: keeps parameter creation order stable.
    self.bert = BertModel(cfg)
    self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
    self.classifier = nn.Linear(cfg.hidden_size, num_classes)

    self.init_weights()

replace_keys_in_state_dict `class-attribute` `instance-attribute` ¤

replace_keys_in_state_dict = {
    "gamma": "weight",
    "beta": "bias",
}

config `instance-attribute` ¤

config = from_json_file(config_path)

bert `instance-attribute` ¤

bert = BertModel(config)

dropout `instance-attribute` ¤

dropout = Dropout(hidden_dropout_prob)

classifier `instance-attribute` ¤

classifier = Linear(hidden_size, num_classes)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    token_type_ids: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
) -> Tensor

Source code in src/unitorch/models/bert/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Run the encoder and classify its pooled output.

    Args:
        input_ids: Token ids passed positionally to ``self.bert``.
        attention_mask: Optional mask forwarded to ``self.bert``.
        token_type_ids: Optional segment ids forwarded to ``self.bert``.
        position_ids: Optional position ids forwarded to ``self.bert``.

    Returns:
        The output of ``self.classifier`` applied to the dropped-out
        ``pooler_output`` of the encoder.
    """
    encoder_kwargs = {
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "position_ids": position_ids,
    }
    encoder_outputs = self.bert(input_ids, **encoder_kwargs)
    pooled = encoder_outputs.pooler_output
    regularised = self.dropout(pooled)
    logits = self.classifier(regularised)
    return logits

unitorch.models.bert¤

BertProcessor¤

do_whole_word_mask instance-attribute ¤

masked_lm_prob instance-attribute ¤

max_predictions_per_seq instance-attribute ¤

vocab_words instance-attribute ¤