unitorch.models.peft¤

BloomLoraForClassification¤

Bases: GenericPeftModel

Source code in src/unitorch/models/peft/modeling_bloom.py

def __init__(
    self,
    config_path: str,
    lora_r: Optional[int] = 16,
    lora_alpha: Optional[int] = 32,
    lora_dropout: Optional[float] = 0.05,
    fan_in_fan_out: Optional[bool] = True,
    target_modules: Optional[Union[List[str], str]] = ["query_key_value"],
    num_classes: Optional[int] = 1,
    hidden_dropout_prob: Optional[float] = 0.1,
    freeze_classifer: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Bloom LoRA model for classification tasks.

    Builds a ``BloomModel`` from a JSON configuration file, wraps it in
    ``PeftModelForSequenceClassification`` using a ``LoraConfig``, and adds
    a dropout layer plus a linear classification head.

    Args:
        config_path (str): Path to the model configuration file.
        lora_r (int, optional): Forwarded to ``LoraConfig`` as ``r``. Defaults to 16.
        lora_alpha (int, optional): Forwarded to ``LoraConfig``. Defaults to 32.
        lora_dropout (float, optional): Forwarded to ``LoraConfig``. Defaults to 0.05.
        fan_in_fan_out (bool, optional): Forwarded to ``LoraConfig``. Defaults to True.
        target_modules (List[str] or str, optional): Forwarded to ``LoraConfig``.
            Defaults to ["query_key_value"].
        num_classes (int, optional): Output size of the linear classifier. Defaults to 1.
        hidden_dropout_prob (float, optional): Probability of the ``nn.Dropout`` layer.
            Defaults to 0.1.
        freeze_classifer (bool, optional): When True, every classifier parameter gets
            ``requires_grad = False``. Defaults to True.
        gradient_checkpointing (bool, optional): Value stored on
            ``config.gradient_checkpointing``. Defaults to False.
    """
    super().__init__()

    # Load the backbone configuration and record the checkpointing flag on it.
    bloom_config = BloomConfig.from_json_file(config_path)
    bloom_config.gradient_checkpointing = gradient_checkpointing
    self.config = bloom_config

    # NOTE: the list default of `target_modules` is shared between calls; it is
    # only read here, never mutated.
    lora_settings = {
        "r": lora_r,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "fan_in_fan_out": fan_in_fan_out,
        "target_modules": target_modules,
    }
    self.peft_config = LoraConfig(**lora_settings)

    backbone = BloomModel(self.config)
    self.peft_model = PeftModelForSequenceClassification(backbone, self.peft_config)

    # Classification head.
    self.dropout = nn.Dropout(p=hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, num_classes)
    if freeze_classifer:
        for weight in self.classifier.parameters():
            weight.requires_grad = False

    # Inherited from the base class; its behaviour is not visible from here.
    self.init_weights()

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
)

Forward pass of the classification model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input tensor of shape (batch_size, sequence_length).	required
`attention_mask`	`Tensor`	Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.	`None`
`position_ids`	`Tensor`	Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.	`None`

Returns:

Type	Description
	`torch.Tensor`: Output logits tensor of shape (batch_size, num_classes).

Source code in src/unitorch/models/peft/modeling_bloom.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
):
    """
    Forward pass of the classification model.

    Runs the LoRA-wrapped backbone, takes the hidden state at the last
    sequence position, applies dropout and feeds it to the classifier.

    Args:
        input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        attention_mask (torch.Tensor, optional): Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.
        position_ids (torch.Tensor, optional): Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.

    Returns:
        torch.Tensor: Output logits tensor of shape (batch_size, num_classes).
    """
    backbone_outputs = self.peft_model(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    # The first element of the backbone output holds the per-position states.
    hidden_states = backbone_outputs[0]
    # Pool by selecting index -1 along dim 1.
    # NOTE(review): with right-padded batches this position may be padding -- confirm against callers.
    last_position = hidden_states[:, -1]
    return self.classifier(self.dropout(last_position))

BloomLoraForGeneration¤

Bases: GenericPeftModel

Bloom LoRA model for text generation tasks.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the model configuration file.	required
`gradient_checkpointing`	`bool`	Whether to use gradient checkpointing. Defaults to False.	`False`

Source code in src/unitorch/models/peft/modeling_bloom.py

def __init__(
    self,
    config_path: str,
    lora_r: Optional[int] = 16,
    lora_alpha: Optional[int] = 32,
    lora_dropout: Optional[float] = 0.05,
    fan_in_fan_out: Optional[bool] = True,
    target_modules: Optional[Union[List[str], str]] = ["query_key_value"],
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Bloom LoRA model for text generation tasks.

    Builds a ``BloomForCausalLM`` from a JSON configuration file and wraps
    it in ``PeftModelForCausalLM`` using a ``LoraConfig``.

    Args:
        config_path (str): Path to the model configuration file.
        lora_r (int, optional): Forwarded to ``LoraConfig`` as ``r``. Defaults to 16.
        lora_alpha (int, optional): Forwarded to ``LoraConfig``. Defaults to 32.
        lora_dropout (float, optional): Forwarded to ``LoraConfig``. Defaults to 0.05.
        fan_in_fan_out (bool, optional): Forwarded to ``LoraConfig``. Defaults to True.
        target_modules (List[str] or str, optional): Forwarded to ``LoraConfig``.
            Defaults to ["query_key_value"].
        gradient_checkpointing (bool, optional): Whether to use gradient checkpointing;
            stored on ``config.gradient_checkpointing``. Defaults to False.
    """
    super().__init__()

    # Load the backbone configuration and record the checkpointing flag on it.
    bloom_config = BloomConfig.from_json_file(config_path)
    bloom_config.gradient_checkpointing = gradient_checkpointing
    self.config = bloom_config

    # NOTE: the list default of `target_modules` is shared between calls; it is
    # only read here, never mutated.
    lora_settings = {
        "r": lora_r,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "fan_in_fan_out": fan_in_fan_out,
        "target_modules": target_modules,
    }
    self.peft_config = LoraConfig(**lora_settings)

    causal_lm = BloomForCausalLM(self.config)
    self.peft_model = PeftModelForCausalLM(causal_lm, self.peft_config)

    # Inherited from the base class; its behaviour is not visible from here.
    self.init_weights()

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
)

Forward pass of the generation model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input tensor of shape (batch_size, sequence_length).	required
`attention_mask`	`Tensor`	Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.	`None`
`position_ids`	`Tensor`	Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.	`None`

Returns:

Type	Description
	`torch.Tensor`: Output logits tensor of shape (batch_size, sequence_length, vocab_size).

Source code in src/unitorch/models/peft/modeling_bloom.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
):
    """
    Forward pass of the generation model.

    Args:
        input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        attention_mask (torch.Tensor, optional): Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.
        position_ids (torch.Tensor, optional): Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.

    Returns:
        torch.Tensor: Output logits tensor of shape (batch_size, sequence_length, vocab_size).
    """
    # return_dict=True so the logits can be read by attribute below.
    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        "return_dict": True,
    }
    return self.peft_model(**model_inputs).logits

generate ¤

generate(
    input_ids: Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 1,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 2,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
)

Generate text using the generation model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input tensor of shape (batch_size, sequence_length).	required
`num_beams`	`int`	Number of beams for beam search. Defaults to 5.	`5`
`decoder_start_token_id`	`int`	The ID of the decoder start token. Defaults to 1.	`1`
`decoder_end_token_id`	`int or List[int]`	The ID(s) of the decoder end token(s). Defaults to 2.	`2`
`num_return_sequences`	`int`	Number of generated sequences to return. Defaults to 1.	`1`
`min_gen_seq_length`	`int`	Minimum length of generated sequences. Defaults to 0.	`0`
`max_gen_seq_length`	`int`	Maximum length of generated sequences. Defaults to 48.	`48`
`repetition_penalty`	`float`	Penalty for repeated tokens. Defaults to 1.0.	`1.0`
`no_repeat_ngram_size`	`int`	Size of n-grams to avoid repeating. Defaults to 0.	`0`
`early_stopping`	`bool`	Whether to stop generation early. Defaults to True.	`True`
`length_penalty`	`float`	Penalty for longer sequences. Defaults to 1.0.	`1.0`
`num_beam_groups`	`int`	Number of beam groups for diverse beam search. Defaults to 1.	`1`
`diversity_penalty`	`float`	Penalty for diverse sequences in diverse beam search. Defaults to 0.0.	`0.0`
`do_sample`	`bool`	Whether to use sampling for generation. Defaults to False.	`False`
`temperature`	`float`	Sampling temperature. Defaults to 1.0.	`1.0`
`top_k`	`int`	Top-k value for sampling. Defaults to 50.	`50`
`top_p`	`float`	Top-p value for sampling. Defaults to 1.0.	`1.0`

Returns:

Name	Type	Description
`GenericOutputs`		Generated sequences and their scores.

Source code in src/unitorch/models/peft/modeling_bloom.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 1,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 2,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
):
    """
    Generate text using the generation model.

    The prompt tokens are stripped from the returned sequences, and each
    generated continuation is right-padded with zeros up to
    ``max_gen_seq_length``.

    Args:
        input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        num_beams (int, optional): Number of beams for beam search. Defaults to 5.
        decoder_start_token_id (int, optional): The ID of the decoder start token; also passed
            to the underlying ``generate`` as ``bos_token_id``. Defaults to 1.
        decoder_end_token_id (int or List[int], optional): The ID(s) of the decoder end token(s);
            passed to the underlying ``generate`` as ``eos_token_id``. Defaults to 2.
        num_return_sequences (int, optional): Number of generated sequences to return. Defaults to 1.
        min_gen_seq_length (int, optional): Minimum length of generated sequences (excluding the prompt). Defaults to 0.
        max_gen_seq_length (int, optional): Maximum length of generated sequences (excluding the prompt). Defaults to 48.
        repetition_penalty (float, optional): Penalty for repeated tokens. Defaults to 1.0.
        no_repeat_ngram_size (int, optional): Size of n-grams to avoid repeating. Defaults to 0.
        early_stopping (bool, optional): Whether to stop generation early. Defaults to True.
        length_penalty (float, optional): Penalty for longer sequences. Defaults to 1.0.
        num_beam_groups (int, optional): Number of beam groups for diverse beam search. Defaults to 1.
        diversity_penalty (float, optional): Penalty for diverse sequences in diverse beam search. Defaults to 0.0.
        do_sample (bool, optional): Whether to use sampling for generation. Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 1.0.
        top_k (int, optional): Top-k value for sampling. Defaults to 50.
        top_p (float, optional): Top-p value for sampling. Defaults to 1.0.

    Returns:
        GenericOutputs: ``sequences`` is a long tensor of shape
            (batch_size, num_return_sequences, max_gen_seq_length), or
            (batch_size, max_gen_seq_length) when ``num_return_sequences == 1``.
            ``sequences_scores`` holds the scores reported by the underlying
            ``generate`` call, or None when that call does not report them.
    """
    input_seq_length = input_ids.size(1)
    # Length limits of the underlying generate() include the prompt, hence the offsets.
    outputs = self.peft_model.generate(
        input_ids=input_ids,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        decoder_start_token_id=decoder_start_token_id,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Regroup the flat result so dim 1 indexes the returned sequences per input row.
    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )

    # Drop the prompt and right-pad the continuation with zeros. The buffer is
    # created with the dtype and device of `sequences` so token ids never pass
    # through a floating-point tensor (lossy under a half-precision default
    # dtype) and no extra device transfer is needed.
    generated = sequences[:, :, input_seq_length:]
    padded = sequences.new_zeros(
        (sequences.size(0), num_return_sequences, max_gen_seq_length)
    )
    padded[:, :, : generated.size(-1)] = generated

    if num_return_sequences == 1:
        padded = padded.reshape(-1, max_gen_seq_length)

    # Only beam-search outputs expose `sequences_scores`; fall back to None for
    # greedy/sampling runs (e.g. num_beams=1) instead of raising AttributeError.
    # NOTE(review): assumes GenericOutputs accepts a None field -- confirm.
    return GenericOutputs(
        sequences=padded.long(),
        sequences_scores=getattr(outputs, "sequences_scores", None),
    )

LlamaLoraForClassification¤

Bases: GenericPeftModel

Source code in src/unitorch/models/peft/modeling_llama.py

def __init__(
    self,
    config_path: str,
    quant_config_path: Optional[str] = None,
    lora_r: Optional[int] = 16,
    lora_alpha: Optional[int] = 32,
    lora_dropout: Optional[float] = 0.05,
    fan_in_fan_out: Optional[bool] = True,
    target_modules: Optional[Union[List[str], str]] = ["q_proj", "v_proj"],
    num_classes: Optional[int] = 1,
    hidden_dropout_prob: Optional[float] = 0.1,
    freeze_classifer: Optional[bool] = True,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Llama LoRA model for classification tasks.

    Builds a ``LlamaModel`` from a JSON configuration file, optionally
    quantizes it, wraps it in ``PeftModelForSequenceClassification`` using a
    ``LoraConfig``, and adds a dropout layer plus a linear classification head.

    Args:
        config_path (str): Path to the model configuration file.
        quant_config_path (str, optional): Path to a quantization configuration file.
            When given, the backbone is passed through ``quantize_model`` before the
            LoRA wrapping. Defaults to None.
        lora_r (int, optional): Forwarded to ``LoraConfig`` as ``r``. Defaults to 16.
        lora_alpha (int, optional): Forwarded to ``LoraConfig``. Defaults to 32.
        lora_dropout (float, optional): Forwarded to ``LoraConfig``. Defaults to 0.05.
        fan_in_fan_out (bool, optional): Forwarded to ``LoraConfig``. Defaults to True.
        target_modules (List[str] or str, optional): Forwarded to ``LoraConfig``; also
            excluded from quantization. Defaults to ["q_proj", "v_proj"].
        num_classes (int, optional): Output size of the linear classifier. Defaults to 1.
        hidden_dropout_prob (float, optional): Probability of the ``nn.Dropout`` layer.
            Defaults to 0.1.
        freeze_classifer (bool, optional): When True, every classifier parameter gets
            ``requires_grad = False``. Defaults to True.
        gradient_checkpointing (bool, optional): Value stored on
            ``config.gradient_checkpointing``. Defaults to False.
    """
    super().__init__()
    self.config = LlamaConfig.from_json_file(config_path)
    self.config.gradient_checkpointing = gradient_checkpointing
    self.peft_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        fan_in_fan_out=fan_in_fan_out,
        target_modules=target_modules,
    )
    model = LlamaModel(self.config)
    if quant_config_path is not None:
        quant_config = QuantizationConfig.from_json_file(quant_config_path)
        # `target_modules` may be a list, a single string or None; normalise to a
        # fresh list so the concatenation below cannot raise TypeError and the
        # shared default list is never mutated.
        # NOTE(review): how quantize_model treats a pattern-style string entry is
        # not visible here -- confirm.
        if target_modules is None:
            lora_targets = []
        elif isinstance(target_modules, str):
            lora_targets = [target_modules]
        else:
            lora_targets = list(target_modules)
        ignore_modules = lora_targets + ["lm_head"]
        model = quantize_model(model, quant_config, ignore_modules=ignore_modules)
    self.peft_model = PeftModelForSequenceClassification(model, self.peft_config)
    self.dropout = nn.Dropout(hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, num_classes)
    if freeze_classifer:
        for param in self.classifier.parameters():
            param.requires_grad = False
    # Inherited from the base class; its behaviour is not visible from here.
    self.init_weights()

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
)

Forward pass of the classification model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input tensor of shape (batch_size, sequence_length).	required
`attention_mask`	`Tensor`	Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.	`None`
`position_ids`	`Tensor`	Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.	`None`

Returns:

Type	Description
	`torch.Tensor`: Output logits tensor of shape (batch_size, num_classes).

Source code in src/unitorch/models/peft/modeling_llama.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
):
    """
    Forward pass of the classification model.

    Runs the LoRA-wrapped backbone, takes the hidden state at the last
    sequence position, applies dropout and feeds it to the classifier.

    Args:
        input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        attention_mask (torch.Tensor, optional): Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.
        position_ids (torch.Tensor, optional): Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.

    Returns:
        torch.Tensor: Output logits tensor of shape (batch_size, num_classes).
    """
    backbone_outputs = self.peft_model(
        input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
    )
    # The first element of the backbone output holds the per-position states.
    hidden_states = backbone_outputs[0]
    # Pool by selecting index -1 along dim 1.
    # NOTE(review): with right-padded batches this position may be padding -- confirm against callers.
    last_position = hidden_states[:, -1]
    return self.classifier(self.dropout(last_position))

LlamaLoraForGeneration¤

Bases: GenericPeftModel

Source code in src/unitorch/models/peft/modeling_llama.py

def __init__(
    self,
    config_path: str,
    quant_config_path: Optional[str] = None,
    lora_r: Optional[int] = 16,
    lora_alpha: Optional[int] = 32,
    lora_dropout: Optional[float] = 0.05,
    fan_in_fan_out: Optional[bool] = True,
    target_modules: Optional[Union[List[str], str]] = ["q_proj", "v_proj"],
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Llama LoRA model for text generation tasks.

    Builds a ``LlamaForCausalLM`` from a JSON configuration file, optionally
    quantizes it, and wraps it in ``PeftModelForCausalLM`` using a ``LoraConfig``.

    Args:
        config_path (str): Path to the model configuration file.
        quant_config_path (str, optional): Path to a quantization configuration file.
            When given, the model is passed through ``quantize_model`` before the
            LoRA wrapping. Defaults to None.
        lora_r (int, optional): Forwarded to ``LoraConfig`` as ``r``. Defaults to 16.
        lora_alpha (int, optional): Forwarded to ``LoraConfig``. Defaults to 32.
        lora_dropout (float, optional): Forwarded to ``LoraConfig``. Defaults to 0.05.
        fan_in_fan_out (bool, optional): Forwarded to ``LoraConfig``. Defaults to True.
        target_modules (List[str] or str, optional): Forwarded to ``LoraConfig``; also
            excluded from quantization together with ``lm_head``.
            Defaults to ["q_proj", "v_proj"].
        gradient_checkpointing (bool, optional): Value stored on
            ``config.gradient_checkpointing``. Defaults to False.
    """
    super().__init__()
    self.config = LlamaConfig.from_json_file(config_path)
    self.config.gradient_checkpointing = gradient_checkpointing
    self.peft_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        fan_in_fan_out=fan_in_fan_out,
        target_modules=target_modules,
    )
    model = LlamaForCausalLM(self.config)
    if quant_config_path is not None:
        quant_config = QuantizationConfig.from_json_file(quant_config_path)
        # `target_modules` may be a list, a single string or None; normalise to a
        # fresh list so the concatenation below cannot raise TypeError and the
        # shared default list is never mutated.
        # NOTE(review): how quantize_model treats a pattern-style string entry is
        # not visible here -- confirm.
        if target_modules is None:
            lora_targets = []
        elif isinstance(target_modules, str):
            lora_targets = [target_modules]
        else:
            lora_targets = list(target_modules)
        ignore_modules = lora_targets + ["lm_head"]
        model = quantize_model(model, quant_config, ignore_modules=ignore_modules)
    self.peft_model = PeftModelForCausalLM(model, self.peft_config)
    # Inherited from the base class; its behaviour is not visible from here.
    self.init_weights()

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    position_ids: Optional[Tensor] = None,
)

Forward pass of the generation model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input tensor of shape (batch_size, sequence_length).	required
`attention_mask`	`Tensor`	Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.	`None`
`position_ids`	`Tensor`	Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.	`None`

Returns:

Type	Description
	`torch.Tensor`: Output logits tensor of shape (batch_size, sequence_length, vocab_size).

Source code in src/unitorch/models/peft/modeling_llama.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
):
    """
    Forward pass of the generation model.

    Args:
        input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        attention_mask (torch.Tensor, optional): Attention mask tensor of shape (batch_size, sequence_length). Defaults to None.
        position_ids (torch.Tensor, optional): Position IDs tensor of shape (batch_size, sequence_length). Defaults to None.

    Returns:
        torch.Tensor: Output logits tensor of shape (batch_size, sequence_length, vocab_size).
    """
    # return_dict=True so the logits can be read by attribute below.
    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        "return_dict": True,
    }
    return self.peft_model(**model_inputs).logits

generate ¤

generate(
    input_ids: Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 1,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 2,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
)

Generate text using the generation model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input tensor of shape (batch_size, sequence_length).	required
`num_beams`	`int`	Number of beams for beam search. Defaults to 5.	`5`
`decoder_start_token_id`	`int`	The ID of the decoder start token. Defaults to 1.	`1`
`decoder_end_token_id`	`int or List[int]`	The ID(s) of the decoder end token(s). Defaults to 2.	`2`
`num_return_sequences`	`int`	Number of generated sequences to return. Defaults to 1.	`1`
`min_gen_seq_length`	`int`	Minimum length of generated sequences. Defaults to 0.	`0`
`max_gen_seq_length`	`int`	Maximum length of generated sequences. Defaults to 48.	`48`
`repetition_penalty`	`float`	Penalty for repeated tokens. Defaults to 1.0.	`1.0`
`no_repeat_ngram_size`	`int`	Size of n-grams to avoid repeating. Defaults to 0.	`0`
`early_stopping`	`bool`	Whether to stop generation early. Defaults to True.	`True`
`length_penalty`	`float`	Penalty for longer sequences. Defaults to 1.0.	`1.0`
`num_beam_groups`	`int`	Number of beam groups for diverse beam search. Defaults to 1.	`1`
`diversity_penalty`	`float`	Penalty for diverse sequences in diverse beam search. Defaults to 0.0.	`0.0`
`do_sample`	`bool`	Whether to use sampling for generation. Defaults to False.	`False`
`temperature`	`float`	Sampling temperature. Defaults to 1.0.	`1.0`
`top_k`	`int`	Top-k value for sampling. Defaults to 50.	`50`
`top_p`	`float`	Top-p value for sampling. Defaults to 1.0.	`1.0`

Returns:

Name	Type	Description
`GenericOutputs`		Generated sequences and their scores.

Source code in src/unitorch/models/peft/modeling_llama.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 1,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 2,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 48,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
):
    """
    Generate text using the generation model.

    The prompt tokens are stripped from the returned sequences, and each
    generated continuation is right-padded with zeros up to
    ``max_gen_seq_length``.

    Args:
        input_ids (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        num_beams (int, optional): Number of beams for beam search. Defaults to 5.
        decoder_start_token_id (int, optional): The ID of the decoder start token; also passed
            to the underlying ``generate`` as ``bos_token_id``. Defaults to 1.
        decoder_end_token_id (int or List[int], optional): The ID(s) of the decoder end token(s);
            passed to the underlying ``generate`` as ``eos_token_id``. Defaults to 2.
        num_return_sequences (int, optional): Number of generated sequences to return. Defaults to 1.
        min_gen_seq_length (int, optional): Minimum length of generated sequences (excluding the prompt). Defaults to 0.
        max_gen_seq_length (int, optional): Maximum length of generated sequences (excluding the prompt). Defaults to 48.
        repetition_penalty (float, optional): Penalty for repeated tokens. Defaults to 1.0.
        no_repeat_ngram_size (int, optional): Size of n-grams to avoid repeating. Defaults to 0.
        early_stopping (bool, optional): Whether to stop generation early. Defaults to True.
        length_penalty (float, optional): Penalty for longer sequences. Defaults to 1.0.
        num_beam_groups (int, optional): Number of beam groups for diverse beam search. Defaults to 1.
        diversity_penalty (float, optional): Penalty for diverse sequences in diverse beam search. Defaults to 0.0.
        do_sample (bool, optional): Whether to use sampling for generation. Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 1.0.
        top_k (int, optional): Top-k value for sampling. Defaults to 50.
        top_p (float, optional): Top-p value for sampling. Defaults to 1.0.

    Returns:
        GenericOutputs: ``sequences`` is a long tensor of shape
            (batch_size, num_return_sequences, max_gen_seq_length), or
            (batch_size, max_gen_seq_length) when ``num_return_sequences == 1``.
            ``sequences_scores`` holds the scores reported by the underlying
            ``generate`` call, or None when that call does not report them.
    """
    input_seq_length = input_ids.size(1)
    # Length limits of the underlying generate() include the prompt, hence the offsets.
    outputs = self.peft_model.generate(
        input_ids=input_ids,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        decoder_start_token_id=decoder_start_token_id,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Regroup the flat result so dim 1 indexes the returned sequences per input row.
    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )

    # Drop the prompt and right-pad the continuation with zeros. The buffer is
    # created with the dtype and device of `sequences` so token ids never pass
    # through a floating-point tensor (lossy under a half-precision default
    # dtype) and no extra device transfer is needed.
    generated = sequences[:, :, input_seq_length:]
    padded = sequences.new_zeros(
        (sequences.size(0), num_return_sequences, max_gen_seq_length)
    )
    padded[:, :, : generated.size(-1)] = generated

    if num_return_sequences == 1:
        padded = padded.reshape(-1, max_gen_seq_length)

    # Only beam-search outputs expose `sequences_scores`; fall back to None for
    # greedy/sampling runs (e.g. num_beams=1) instead of raising AttributeError.
    # NOTE(review): assumes GenericOutputs accepts a None field -- confirm.
    return GenericOutputs(
        sequences=padded.long(),
        sequences_scores=getattr(outputs, "sequences_scores", None),
    )