
unitorch.models.bert¤

BertProcessor¤

Bases: HfTextClassificationProcessor

Initializes the BertProcessor.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| vocab_path | str | The path to the vocabulary file. | required |
| max_seq_length | Optional[int] | The maximum sequence length. | 128 |
| special_input_ids | Optional[Dict] | Special input IDs mapping. | dict() |
| do_lower_case | Optional[bool] | Whether to perform lowercase tokenization. | True |
| do_basic_tokenize | Optional[bool] | Whether to perform basic tokenization. | True |
| do_whole_word_mask | Optional[bool] | Whether to perform whole word masking. | True |
| masked_lm_prob | Optional[float] | The probability of masking a token for pretraining. | 0.15 |
| max_predictions_per_seq | Optional[int] | The maximum number of masked tokens per sequence for pretraining. | 20 |
Source code in src/unitorch/models/bert/processing.py
def __init__(
    self,
    vocab_path: str,
    max_seq_length: Optional[int] = 128,
    special_input_ids: Optional[Dict] = dict(),
    do_lower_case: Optional[bool] = True,
    do_basic_tokenize: Optional[bool] = True,
    do_whole_word_mask: Optional[bool] = True,
    masked_lm_prob: Optional[float] = 0.15,
    max_predictions_per_seq: Optional[int] = 20,
):
    """
    Initializes the BertProcessor.

    Args:
        vocab_path (str): The path to the vocabulary file.
        max_seq_length (Optional[int], optional): The maximum sequence length. Defaults to 128.
        special_input_ids (Optional[Dict], optional): Special input IDs mapping. Defaults to an empty dictionary.
        do_lower_case (Optional[bool], optional): Whether to perform lowercase tokenization. Defaults to True.
        do_basic_tokenize (Optional[bool], optional): Whether to perform basic tokenization. Defaults to True.
        do_whole_word_mask (Optional[bool], optional): Whether to perform whole word masking. Defaults to True.
        masked_lm_prob (Optional[float], optional): The probability of masking a token for pretraining. Defaults to 0.15.
        max_predictions_per_seq (Optional[int], optional): The maximum number of masked tokens per sequence for pretraining. Defaults to 20.
    """
    tokenizer = get_bert_tokenizer(
        vocab_path,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
        special_input_ids=special_input_ids,
    )
    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
    )
    self.do_whole_word_mask = do_whole_word_mask
    self.masked_lm_prob = masked_lm_prob
    self.max_predictions_per_seq = max_predictions_per_seq
    self.vocab_words = list(self.tokenizer.vocab.keys())
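
A minimal construction sketch (not part of the library source): it assumes BertProcessor is importable from unitorch.models.bert as documented on this page, and the vocabulary path is a hypothetical placeholder for a BERT WordPiece vocab.txt.

from unitorch.models.bert import BertProcessor

processor = BertProcessor(
    vocab_path="/path/to/vocab.txt",  # hypothetical path to a WordPiece vocab file
    max_seq_length=128,
)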

pretrain ¤

pretrain(
    text: str,
    text_pair: str,
    nsp_label: int,
    max_seq_length: Optional[int] = None,
    masked_lm_prob: Optional[float] = None,
    do_whole_word_mask: Optional[bool] = None,
    max_predictions_per_seq: Optional[int] = None,
)

Applies the BERT pretraining processor to the given text and text pair.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The input text. | required |
| text_pair | str | The input text pair. | required |
| nsp_label | int | The next sentence prediction label. | required |
| max_seq_length | Optional[int] | The maximum sequence length. | None |
| masked_lm_prob | Optional[float] | The probability of masking a token for pretraining. | None |
| do_whole_word_mask | Optional[bool] | Whether to perform whole word masking. | None |
| max_predictions_per_seq | Optional[int] | The maximum number of masked tokens per sequence for pretraining. | None |

Returns:

| Type | Description |
| --- | --- |
| GenericOutputs | Pretrain processing outputs. |

Source code in src/unitorch/models/bert/processing.py
def pretrain(
    self,
    text: str,
    text_pair: str,
    nsp_label: int,
    max_seq_length: Optional[int] = None,
    masked_lm_prob: Optional[float] = None,
    do_whole_word_mask: Optional[bool] = None,
    max_predictions_per_seq: Optional[int] = None,
):
    """
    Applies the BERT pretraining processor to the given text and text pair.

    Args:
        text (str): The input text.
        text_pair (str): The input text pair.
        nsp_label (int): The next sentence prediction label.
        max_seq_length (Optional[int], optional): The maximum sequence length. Defaults to None.
        masked_lm_prob (Optional[float], optional): The probability of masking a token for pretraining. Defaults to None.
        do_whole_word_mask (Optional[bool], optional): Whether to perform whole word masking. Defaults to None.
        max_predictions_per_seq (Optional[int], optional): The maximum number of masked tokens per sequence for pretraining. Defaults to None.

    Returns:
        GenericOutputs: pretrain processing outputs.
    """
    max_seq_length = pop_value(
        max_seq_length,
        self.max_seq_length,
    )

    masked_lm_prob = pop_value(
        masked_lm_prob,
        self.masked_lm_prob,
    )

    do_whole_word_mask = pop_value(
        do_whole_word_mask,
        self.do_whole_word_mask,
    )

    max_predictions_per_seq = pop_value(
        max_predictions_per_seq,
        self.max_predictions_per_seq,
    )

    _tokens = self.tokenizer.tokenize(str(text))
    tokens_pair = self.tokenizer.tokenize(str(text_pair))
    truncate_sequence_pair(_tokens, tokens_pair, max_seq_length - 3)
    tokens = (
        [self.cls_token]
        + _tokens
        + [self.sep_token]
        + tokens_pair
        + [self.sep_token]
    )

    covered_indexes = get_random_mask_indexes(
        tokens,
        masked_lm_prob,
        do_whole_word_mask,
        max_predictions_per_seq,
        special_tokens=[self.cls_token, self.sep_token],
    )
    label = [
        tokens[pos] if pos in covered_indexes else self.pad_token
        for pos in range(max_seq_length)
    ]
    label_mask = [
        1 if pos in covered_indexes else 0 for pos in range(max_seq_length)
    ]
    label = self.tokenizer.convert_tokens_to_ids(label)

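    # BERT-style 80/10/10 corruption: 80% of the selected positions become the
    # mask token, ~10% keep the original token, and ~10% get a random vocab word.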
    for index in covered_indexes:
        mask_token = None
        if random.random() < 0.8:
            mask_token = self.mask_token
        else:
            mask_token = (
                tokens[index]
                if random.random() < 0.5
                else get_random_word(self.vocab_words)
            )
        tokens[index] = mask_token

    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
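    # Segment 0 covers [CLS] + text + [SEP]; segment 1 covers text_pair + [SEP].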
    token_type_ids = [0] + [0] * len(_tokens) + [0] + [1] * len(tokens_pair) + [1]
    attention_mask = [1] * len(input_ids)

    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += len(padding) * [self.pad_token_id]
    attention_mask += padding
    token_type_ids += len(padding) * [1]

    assert len(input_ids) == max_seq_length
    assert len(attention_mask) == max_seq_length
    assert len(token_type_ids) == max_seq_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        token_type_ids=torch.tensor(token_type_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        position_ids=torch.tensor(list(range(max_seq_length)), dtype=torch.long),
        nsp_label=torch.tensor(int(nsp_label), dtype=torch.long),
        mlm_label=torch.tensor(label, dtype=torch.long),
        mlm_label_mask=torch.tensor(label_mask, dtype=torch.long),
    )
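
Building on the sketch above, a hedged example of calling pretrain. The sentences and nsp_label value are illustrative only (check your data pipeline's NSP labeling convention), and the shapes assume the default max_seq_length of 128.

outputs = processor.pretrain(
    text="The cat sat on the mat.",
    text_pair="Stock prices rose sharply on Monday.",
    nsp_label=1,  # illustrative label; 0/1 semantics depend on your NSP convention
)
print(outputs.input_ids.shape)            # torch.Size([128])
print(outputs.mlm_label.shape)            # torch.Size([128])
print(int(outputs.mlm_label_mask.sum()))  # number of masked positions (at most 20 by default)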

BertForClassification¤

Bases: GenericModel

Initializes the BertForClassification model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| config_path | str | The path to the configuration file. | required |
| num_classes | Optional[int] | The number of classes for classification. | 1 |
| gradient_checkpointing | Optional[bool] | Whether to use gradient checkpointing. | False |
Source code in src/unitorch/models/bert/modeling.py
def __init__(
    self,
    config_path: str,
    num_classes: Optional[int] = 1,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Initializes the BertForClassification model.

    Args:
        config_path (str): The path to the configuration file.
        num_classes (Optional[int], optional): The number of classes for classification. Defaults to 1.
        gradient_checkpointing (Optional[bool], optional): Whether to use gradient checkpointing. Defaults to False.
    """
    super().__init__()
    self.config = BertConfig.from_json_file(config_path)
    self.config.gradient_checkpointing = gradient_checkpointing
    self.bert = BertModel(self.config)
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, num_classes)
    self.init_weights()
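
A minimal construction sketch (not part of the library source): it assumes BertForClassification is importable from unitorch.models.bert, and the config path is a hypothetical placeholder for a standard BERT config.json.

from unitorch.models.bert import BertForClassification

model = BertForClassification(
    config_path="/path/to/config.json",  # hypothetical path to a BERT config file
    num_classes=2,
)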

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    token_type_ids: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
)

Forward pass of the BertForClassification model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_ids | Tensor | The input tensor of token indices. | required |
| attention_mask | Optional[Tensor] | The attention mask tensor. | None |
| token_type_ids | Optional[Tensor] | The token type IDs tensor. | None |
| position_ids | Optional[Tensor] | The position IDs tensor. | None |

Returns:

| Type | Description |
| --- | --- |
| Tensor | The logits of the model output. |

Source code in src/unitorch/models/bert/modeling.py
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
):
    """
    Forward pass of the BertForClassification model.

    Args:
        input_ids (torch.Tensor): The input tensor of token indices.
        attention_mask (torch.Tensor, optional): The attention mask tensor. Defaults to None.
        token_type_ids (torch.Tensor, optional): The token type IDs tensor. Defaults to None.
        position_ids (torch.Tensor, optional): The position IDs tensor. Defaults to None.

    Returns:
        (torch.Tensor): The logits of the model output.
    """
    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )
    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    return logits
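
A hedged end-to-end sketch, purely to illustrate tensor shapes: the tensors come from the pretrain example above and gain a batch dimension via unsqueeze(0), and the output shape assumes the model was built with num_classes=2.

import torch

model.eval()
with torch.no_grad():
    logits = model(
        input_ids=outputs.input_ids.unsqueeze(0),
        attention_mask=outputs.attention_mask.unsqueeze(0),
        token_type_ids=outputs.token_type_ids.unsqueeze(0),
        position_ids=outputs.position_ids.unsqueeze(0),
    )
print(logits.shape)  # torch.Size([1, 2])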