unitorch.models.qwen¤

QWenProcessor¤

Bases: HfLlmProcessor

Initializes the QWenProcessor.

Parameters:

Name	Type	Description	Default
`tokenizer_file`	`str`	Path to the tokenizer file.	required
`tokenizer_config`	`str`	Path to the tokenizer config JSON file.	`None`
`special_tokens_map`	`str`	Path to the special tokens map JSON file.	`None`
`chat_template`	`str`	Path to the chat template JSON file.	`None`
`max_seq_length`	`int`	Maximum sequence length. Defaults to 12800.	`12800`
`max_gen_seq_length`	`int`	Maximum generated sequence length. Defaults to 512.	`512`

Source code in src/unitorch/models/qwen/processing.py

def __init__(
    self,
    tokenizer_file: str,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    chat_template: Optional[str] = None,
    max_seq_length: Optional[int] = 12800,
    max_gen_seq_length: Optional[int] = 512,
):
    """
    Build a Qwen2 fast tokenizer from local files and pass it to the base processor.

    Args:
        tokenizer_file (str): Path to the tokenizer file.
        tokenizer_config (str, optional): Path to a JSON file whose entries are forwarded
            to the tokenizer constructor as keyword arguments.
        special_tokens_map (str, optional): Path to a JSON file mapping special-token names
            to token specs.
        chat_template (str, optional): Path to a JSON file holding the template under the
            "chat_template" key.
        max_seq_length (int, optional): Maximum sequence length. Defaults to 12800.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
    """
    # Optional JSON side files; a missing path means "no extra settings".
    config_kwargs = {}
    if tokenizer_config:
        config_kwargs = read_json_file(tokenizer_config)
    token_specs = {}
    if special_tokens_map:
        token_specs = read_json_file(special_tokens_map)

    # The added-token table is applied after construction, so take it out of
    # the constructor kwargs first.
    decoder_specs = config_kwargs.pop("added_tokens_decoder", {})

    # Entries tagged as serialized AddedToken are rebuilt; everything else is
    # forwarded unchanged.
    init_kwargs = {}
    for key, value in config_kwargs.items():
        if isinstance(value, dict) and value.get("__type") == "AddedToken":
            value = get_added_token(value)
        init_kwargs[key] = value

    tokenizer = Qwen2TokenizerFast(tokenizer_file=tokenizer_file, **init_kwargs)

    # Register each added token in both lookup directions.
    for token_id, token_spec in decoder_specs.items():
        content = token_spec["content"]
        tokenizer.added_tokens_decoder[token_id] = get_added_token(token_spec)
        tokenizer.added_tokens_encoder[content] = token_id

    special_tokens = {}
    for name, token_spec in token_specs.items():
        special_tokens[name] = get_added_token(token_spec)
    tokenizer.add_special_tokens(special_tokens)

    if chat_template:
        template_payload = read_json_file(chat_template)
        tokenizer.chat_template = template_payload["chat_template"]

    # cls/sep are aliased to the tokenizer's bos/eos tokens.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

QWenVLProcessor¤

Bases: HfLlmProcessor

Initializes the QWenVLProcessor.

Parameters:

Name	Type	Description	Default
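`tokenizer_file`	`str`	Path to the tokenizer file.	required
`vision_config_path`	`str`	Path to the vision (image) processor config JSON file.	required
`tokenizer_config`	`str`	Path to the tokenizer config JSON file.	`None`
`special_tokens_map`	`str`	Path to the special tokens map JSON file.	`None`
`chat_template`	`str`	Path to the chat template JSON file.	`None`
`max_seq_length`	`int`	The maximum sequence length for text inputs. Defaults to 1280.	`1280`
`max_gen_seq_length`	`int`	Maximum generated sequence length. Defaults to 512.	`512`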

Source code in src/unitorch/models/qwen/processing_vl.py

def __init__(
    self,
    tokenizer_file: str,
    vision_config_path: str,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    chat_template: Optional[str] = None,
    max_seq_length: Optional[int] = 1280,
    max_gen_seq_length: Optional[int] = 512,
):
    """
    Initializes the QWenVLProcessor.

    Args:
        tokenizer_file (str): Path to the tokenizer file.
        vision_config_path (str): Path to the JSON config used to build the image processor.
        tokenizer_config (str, optional): Path to the tokenizer config JSON file.
        special_tokens_map (str, optional): Path to the special tokens map JSON file.
        chat_template (str, optional): Path to a JSON file holding the template under the
            "chat_template" key.
        max_seq_length (int, optional): The maximum sequence length for text inputs. Defaults to 1280.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
    """
    tokenizer_config = read_json_file(tokenizer_config) if tokenizer_config else {}
    special_tokens_map = (
        read_json_file(special_tokens_map) if special_tokens_map else {}
    )
    # The added-token table is applied after construction, so it must not be
    # forwarded to the tokenizer constructor.
    added_tokens_decoder = tokenizer_config.pop("added_tokens_decoder", {})
    tokenizer_config = {
        k: (
            get_added_token(v)
            if isinstance(v, dict) and v.get("__type") == "AddedToken"
            else v
        )
        for k, v in tokenizer_config.items()
    }
    tokenizer = Qwen2TokenizerFast(
        tokenizer_file=tokenizer_file,
        **tokenizer_config,
    )
    for idx, spec in added_tokens_decoder.items():
        token = spec["content"]
        tokenizer.added_tokens_decoder[idx] = get_added_token(spec)
        tokenizer.added_tokens_encoder[token] = idx

    special_tokens = {}
    for name, spec in special_tokens_map.items():
        # The previous check `isinstance(spec, dict or str)` evaluated
        # `dict or str` to `dict`, so plain-string entries were silently
        # dropped. Dict specs are rebuilt into tokens; string specs are passed
        # through as-is. Any other shape (e.g. a list) is skipped as before.
        if isinstance(spec, dict):
            special_tokens[name] = get_added_token(spec)
        elif isinstance(spec, str):
            special_tokens[name] = spec
    tokenizer.add_special_tokens(special_tokens)
    if chat_template:
        tokenizer.chat_template = read_json_file(chat_template)["chat_template"]
    # cls/sep are aliased to the tokenizer's bos/eos tokens.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    # Prefer the tokenizer's own image/video tokens and ids when it exposes
    # them; otherwise fall back to the literal pad tokens and look up their ids.
    self.image_token = getattr(tokenizer, "image_token", "<|image_pad|>")
    self.video_token = getattr(tokenizer, "video_token", "<|video_pad|>")
    self.image_token_id = getattr(
        tokenizer, "image_token_id", None
    ) or tokenizer.convert_tokens_to_ids(self.image_token)
    self.video_token_id = getattr(
        tokenizer, "video_token_id", None
    ) or tokenizer.convert_tokens_to_ids(self.video_token)
    self.vision_processor = Qwen2VLImageProcessor.from_json_file(vision_config_path)

    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

image_token `instance-attribute` ¤

image_token = (
    "<|image_pad|>"
    if not hasattr(tokenizer, "image_token")
    else image_token
)

video_token `instance-attribute` ¤

video_token = (
    "<|video_pad|>"
    if not hasattr(tokenizer, "video_token")
    else video_token
)

image_token_id `instance-attribute` ¤

image_token_id = (
    image_token_id
    if getattr(tokenizer, "image_token_id", None)
    else convert_tokens_to_ids(image_token)
)

video_token_id `instance-attribute` ¤

video_token_id = (
    video_token_id
    if getattr(tokenizer, "video_token_id", None)
    else convert_tokens_to_ids(video_token)
)

vision_processor `instance-attribute` ¤

vision_processor = from_json_file(vision_config_path)

processing_images ¤

processing_images(
    images: Union[Image, str, List[Image], List[str]],
)

Process images for classification.

Parameters:

Name	Type	Description	Default
`images`	`(Image, str, List[Image], List[str])`	Input image or list of images.	required

Returns:

Name	Type	Description
`GenericOutputs`		Processed outputs.

Source code in src/unitorch/models/qwen/processing_vl.py

def processing_images(
    self,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
):
    """
    Load the given image(s) and run them through the vision processor.

    Args:
        images (Image.Image, str, List[Image.Image], List[str]): A single image or a list of
            images. Entries that are not already `Image.Image` are opened with `Image.open`.

    Returns:
        The result of calling `self.vision_processor` on the loaded images with
        `return_tensors="pt"`.
    """
    # A lone image or path is treated as a one-element batch.
    is_single = isinstance(images, (Image.Image, str))
    candidates = [images] if is_single else images

    loaded = []
    for item in candidates:
        if not isinstance(item, Image.Image):
            item = Image.open(item)
        loaded.append(item)

    return self.vision_processor(images=loaded, return_tensors="pt")

classification ¤

classification(
    text: str,
    images: Union[Image, str, List[Image], List[str]],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/qwen/processing_vl.py

def classification(
    self,
    text: str,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    max_seq_length: Optional[int] = None,
):
    """
    Tokenize a prompt with its images for classification.

    Each occurrence of `self.image_token` in `text` is expanded, in order, to
    `grid.prod() // merge_size**2` copies of the token, using the grid of the
    image at the matching position.

    Args:
        text (str): Prompt text containing one image token per image.
        images (Image.Image, str, List[Image.Image], List[str]): Input image or list of images.
        max_seq_length (int, optional): Forwarded to the parent `classification`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `image_grid_thw` and `pixel_values`.
    """
    image_inputs = self.processing_images(images)
    merge_area = self.vision_processor.merge_size**2
    image_grid_thw = None
    if image_inputs:
        image_grid_thw = image_inputs["image_grid_thw"]

    # Expand through a temporary marker so that freshly inserted tokens are
    # not matched again by the loop condition.
    marker = "<|placeholder|>"
    cursor = 0
    while self.image_token in text:
        repeat = image_grid_thw[cursor].prod() // merge_area
        text = text.replace(self.image_token, marker * repeat, 1)
        cursor += 1
    text = str(text).replace(marker, self.image_token)

    encoded = super().classification(text, max_seq_length=max_seq_length)

    input_ids = torch.tensor(encoded.input_ids, dtype=torch.long)
    attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)
    grid = torch.tensor(image_grid_thw, dtype=torch.long)
    pixels = torch.tensor(image_inputs["pixel_values"])
    return GenericOutputs(
        input_ids=input_ids,
        attention_mask=attention_mask,
        image_grid_thw=grid,
        pixel_values=pixels,
    )

generation_inputs ¤

generation_inputs(
    text: str,
    images: Union[Image, str, List[Image], List[str]],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/qwen/processing_vl.py

def generation_inputs(
    self,
    text: str,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    max_seq_length: Optional[int] = None,
):
    """
    Build generation inputs for a prompt that may reference images.

    Each occurrence of `self.image_token` in `text` is expanded, in order, to
    `grid.prod() // merge_size**2` copies of the token, using the grid of the
    image at the matching position.

    Args:
        text (str): Prompt text containing one image token per image.
        images (Image.Image, str, List[Image.Image], List[str]): Input image or list of
            images. A falsy value (e.g. `None` or an empty list) means "no images".
        max_seq_length (int, optional): Forwarded to the parent tokenization call.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `image_grid_thw` and `pixel_values`.
            The two image fields are `None` when no images were given.
    """
    image_inputs = self.processing_images(images) if images else None
    image_index, image_merge_size = 0, self.vision_processor.merge_size**2
    image_grid_thw = image_inputs["image_grid_thw"] if image_inputs else None
    # Expand through a temporary marker so that freshly inserted tokens are
    # not matched again by the loop condition.
    while self.image_token in text:
        num_image_tokens = image_grid_thw[image_index].prod() // image_merge_size
        text = text.replace(
            self.image_token,
            "<|placeholder|>" * num_image_tokens,
            1,
        )
        image_index += 1
    text = str(text).replace("<|placeholder|>", self.image_token)
    # NOTE(review): this delegates to the parent's `classification`, not a
    # parent `generation_inputs` — confirm that is intended.
    text_inputs = super().classification(text, max_seq_length=max_seq_length)
    # The no-image path sets `image_inputs` to None above, so the image fields
    # must be guarded here (same handling as `generation`); previously this
    # path failed on `torch.tensor(None)` / `None["pixel_values"]`.
    return GenericOutputs(
        input_ids=torch.tensor(text_inputs.input_ids, dtype=torch.long),
        attention_mask=torch.tensor(text_inputs.attention_mask, dtype=torch.long),
        image_grid_thw=(
            torch.tensor(image_grid_thw, dtype=torch.long)
            if image_grid_thw is not None
            else None
        ),
        pixel_values=(
            torch.tensor(image_inputs["pixel_values"])
            if image_inputs is not None
            else None
        ),
    )

generation ¤

generation(
    text: str,
    images: Union[Image, str, List[Image], List[str]],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/qwen/processing_vl.py

def generation(
    self,
    text: str,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
):
    """
    Tokenize a prompt/target pair, with optional images, for generation.

    Each occurrence of `self.image_token` in `text` is expanded, in order, to
    `grid.prod() // merge_size**2` copies of the token, using the grid of the
    image at the matching position.

    Args:
        text (str): Prompt text; coerced with `str()`.
        images (Image.Image, str, List[Image.Image], List[str]): Input image or list of
            images. A falsy value skips image processing.
        text_pair (str): Target text; coerced with `str()`.
        max_seq_length (int, optional): Forwarded to the parent `generation`.
        max_gen_seq_length (int, optional): Forwarded to the parent `generation`.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `image_grid_thw`, `pixel_values`,
            `input_ids_label` and `attention_mask_label`. The image fields are `None`
            when no images were processed.
    """
    text = str(text)
    text_pair = str(text_pair)

    image_inputs = None
    if images:
        image_inputs = self.processing_images(images)
    merge_area = self.vision_processor.merge_size**2
    image_grid_thw = None
    if image_inputs:
        image_grid_thw = image_inputs["image_grid_thw"]

    # Expand through a temporary marker so that freshly inserted tokens are
    # not matched again by the loop condition.
    marker = "<|placeholder|>"
    cursor = 0
    while self.image_token in text:
        repeat = image_grid_thw[cursor].prod() // merge_area
        text = text.replace(self.image_token, marker * repeat, 1)
        cursor += 1
    text = text.replace(marker, self.image_token)

    encoded = super().generation(
        text,
        text_pair=text_pair,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

    input_ids = torch.tensor(encoded.input_ids, dtype=torch.long)
    attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)

    grid = None
    if image_grid_thw is not None:
        grid = torch.tensor(image_grid_thw, dtype=torch.long)
    pixels = None
    if image_inputs is not None:
        pixels = torch.tensor(image_inputs["pixel_values"])

    label_ids = torch.tensor(encoded.input_ids_label, dtype=torch.long)
    label_mask = torch.tensor(encoded.attention_mask_label, dtype=torch.long)

    return GenericOutputs(
        input_ids=input_ids,
        attention_mask=attention_mask,
        image_grid_thw=grid,
        pixel_values=pixels,
        input_ids_label=label_ids,
        attention_mask_label=label_mask,
    )

messages_generation ¤

messages_generation(
    messages: List[Dict[str, Any]],
    images: Union[Image, str, List[Image], List[str]],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs

Preprocesses messages for generation.

Parameters:

Name	Type	Description	Default
`messages`	`List[Dict[str, Any]]`	The list of messages to process.	required
`images`	`(Image, str, List[Image], List[str])`	Input image or list of images.	required
`max_seq_length`	`Optional[int]`	The maximum sequence length. Defaults to None.	`None`

Returns:

Name	Type	Description
`GenericOutputs`	`GenericOutputs`	The processed input IDs tensor.

Source code in src/unitorch/models/qwen/processing_vl.py

def messages_generation(
    self,
    messages: List[Dict[str, Any]],
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """
    Preprocesses messages for generation.

    Trailing messages whose role is not "assistant" are ignored. The last
    remaining message becomes the generation target and everything before it
    becomes the prompt. The caller's `messages` list is not modified.

    Args:
        messages (List[Dict[str, Any]]): The list of messages to process. Each message
            must have a "role" key.
        images (Image.Image, str, List[Image.Image], List[str]): Input image or list of images,
            forwarded to `generation`.
        max_seq_length (Optional[int]): The maximum sequence length. Defaults to None.

    Returns:
        GenericOutputs: `input_ids`, `attention_mask`, `image_grid_thw`, `pixel_values`,
            `input_ids_label` and `attention_mask_label` as produced by `generation`.
    """
    # Trim a shallow copy: popping from `messages` directly would drop
    # entries from the list the caller still holds.
    trimmed = list(messages)
    while trimmed and trimmed[-1]["role"] != "assistant":
        trimmed.pop()

    text = self.chat_template(trimmed[:-1])
    text_pair = self.chat_template(trimmed[-1:])
    outputs = self.generation(
        text=text,
        images=images,
        text_pair=text_pair,
        max_seq_length=max_seq_length,
    )
    return GenericOutputs(
        input_ids=outputs.input_ids,
        attention_mask=outputs.attention_mask,
        image_grid_thw=outputs.image_grid_thw,
        pixel_values=outputs.pixel_values,
        input_ids_label=outputs.input_ids_label,
        attention_mask_label=outputs.attention_mask_label,
    )

QWen3ForGeneration¤

Bases: GenericModel, PeftWeightLoaderMixin

Qwen3 model for text generation tasks.

Initializes the QWen3ForGeneration model.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the Qwen3 configuration file.	required
`gradient_checkpointing`	`bool`	Whether to use gradient checkpointing. Defaults to False.	`False`

Source code in src/unitorch/models/qwen/modeling.py

def __init__(
    self,
    config_path: str,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build a Qwen3 causal language model from a JSON configuration file.

    Args:
        config_path (str): Path to the Qwen3 configuration file.
        gradient_checkpointing (bool, optional): Value stored on the config's
            `gradient_checkpointing` attribute. Defaults to False.
    """
    super().__init__()
    config = Qwen3Config.from_json_file(config_path)
    self.config = config
    # Set the flag on the config before the model is constructed from it.
    config.gradient_checkpointing = gradient_checkpointing
    self.model = Qwen3ForCausalLM(config)
    self.init_weights()

prefix_keys_in_state_dict `class-attribute` `instance-attribute` ¤

# Maps a regex pattern to a prefix string. The single pattern here matches any
# key that does not start with "model.model."; it is paired with the prefix
# "model.".
# NOTE(review): presumably used by the weight-loading code to rewrite
# checkpoint keys so they line up with `self.model` — confirm against the
# base class.
prefix_keys_in_state_dict = {
    "^(?!model\\.model\\.).*": "model."
}

config `instance-attribute` ¤

config = from_json_file(config_path)

model `instance-attribute` ¤

model = Qwen3ForCausalLM(config)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
) -> Tensor

Forward pass of the QWen3ForGeneration model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input token IDs.	required
`attention_mask`	`Tensor`	Attention mask. Defaults to None.	`None`

Returns:

Type	Description
`Tensor`	torch.Tensor: Output logits.

Source code in src/unitorch/models/qwen/modeling.py

def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Run the wrapped causal LM and return its logits.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.

    Returns:
        torch.Tensor: The `logits` field of the model output.
    """
    # return_dict=True so the result exposes `.logits` by name.
    return self.model(
        input_ids, attention_mask=attention_mask, return_dict=True
    ).logits

generate ¤

generate(
    input_ids: Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
) -> GenericOutputs

Generates sequences using the QWen3ForGeneration model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input token IDs.	required
`num_beams`	`int`	Number of beams for beam search. Defaults to 5.	`5`
`decoder_start_token_id`	`int`	Start token ID. Defaults to 151643.	`151643`
`decoder_end_token_id`	`int or List[int]`	End token ID. Defaults to 151645.	`151645`
`decoder_pad_token_id`	`int`	Pad token ID. Defaults to 151643.	`151643`
`num_return_sequences`	`int`	Number of sequences to return. Defaults to 1.	`1`
`min_gen_seq_length`	`int`	Minimum generated sequence length. Defaults to 0.	`0`
`max_gen_seq_length`	`int`	Maximum generated sequence length. Defaults to 512.	`512`
`repetition_penalty`	`float`	Repetition penalty. Defaults to 1.0.	`1.0`
`no_repeat_ngram_size`	`int`	N-gram size to avoid repeating. Defaults to 0.	`0`
`early_stopping`	`bool`	Whether to stop early. Defaults to True.	`True`
`length_penalty`	`float`	Length penalty. Defaults to 1.0.	`1.0`
`num_beam_groups`	`int`	Number of beam groups. Defaults to 1.	`1`
`diversity_penalty`	`float`	Diversity penalty. Defaults to 0.0.	`0.0`
`do_sample`	`bool`	Whether to use sampling. Defaults to False.	`False`
`temperature`	`float`	Sampling temperature. Defaults to 1.0.	`1.0`
`top_k`	`int`	Top-k sampling. Defaults to 50.	`50`
`top_p`	`float`	Top-p (nucleus) sampling. Defaults to 1.0.	`1.0`

Returns:

Name	Type	Description
`GenericOutputs`	`GenericOutputs`	Generated sequences and their scores.

Source code in src/unitorch/models/qwen/modeling.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
) -> GenericOutputs:
    """
    Generates sequences using the QWen3ForGeneration model.

    The prompt tokens are stripped from the result, and each generated sequence is
    right-padded with `decoder_start_token_id` up to `max_gen_seq_length`.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        num_beams (int, optional): Number of beams for beam search. Defaults to 5.
        decoder_start_token_id (int, optional): Start token ID, also used as the fill value
            for the padded output. Defaults to 151643.
        decoder_end_token_id (int or List[int], optional): End token ID. Defaults to 151645.
        decoder_pad_token_id (int, optional): Pad token ID. Defaults to 151643.
        num_return_sequences (int, optional): Number of sequences to return. Defaults to 1.
        min_gen_seq_length (int, optional): Minimum generated sequence length. Defaults to 0.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
        repetition_penalty (float, optional): Repetition penalty. Defaults to 1.0.
        no_repeat_ngram_size (int, optional): N-gram size to avoid repeating. Defaults to 0.
        early_stopping (bool, optional): Whether to stop early. Defaults to True.
        length_penalty (float, optional): Length penalty. Defaults to 1.0.
        num_beam_groups (int, optional): Number of beam groups. Defaults to 1.
        diversity_penalty (float, optional): Diversity penalty. Defaults to 0.0.
        do_sample (bool, optional): Whether to use sampling. Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 1.0.
        top_k (int, optional): Top-k sampling. Defaults to 50.
        top_p (float, optional): Top-p (nucleus) sampling. Defaults to 1.0.

    Returns:
        GenericOutputs: `sequences` of shape (batch, max_gen_seq_length) when
            `num_return_sequences == 1`, otherwise
            (batch, num_return_sequences, max_gen_seq_length); and `sequences_scores`,
            which is `None` when the decoding strategy does not report them.
    """
    input_seq_length = input_ids.size(1)
    # max/min lengths passed to the model count the prompt as well, so the
    # prompt length is added to the generation-only limits.
    outputs = self.model.generate(
        input_ids,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        pad_token_id=decoder_pad_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Group the returned sequences per input example.
    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )
    # Fixed-width buffer; the generated part (everything after the prompt) is
    # copied into its left side.
    padded = torch.full(
        (sequences.size(0), num_return_sequences, max_gen_seq_length),
        fill_value=decoder_start_token_id,
        device=sequences.device,
    )
    padded[:, :, : sequences.size(-1) - input_seq_length].copy_(
        sequences[:, :, input_seq_length : sequences.size(-1)]
    )

    if num_return_sequences == 1:
        padded = padded.reshape(-1, max_gen_seq_length)

    # `sequences_scores` is only present on beam-search outputs; greedy or
    # sampling runs (num_beams=1) return an output object without it, which
    # previously raised AttributeError here.
    sequences_scores = getattr(outputs, "sequences_scores", None)

    return GenericOutputs(
        sequences=padded.long(),
        sequences_scores=sequences_scores,
    )

QWen3VLForGeneration¤

Bases: GenericModel, PeftWeightLoaderMixin

Qwen3-VL model for vision-language text generation tasks.

Initializes the QWen3VLForGeneration model.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the Qwen3-VL configuration file.	required
`gradient_checkpointing`	`bool`	Whether to use gradient checkpointing. Defaults to False.	`False`

Source code in src/unitorch/models/qwen/modeling_vl.py

def __init__(
    self,
    config_path: str,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Build a Qwen3-VL conditional-generation model from a JSON configuration file.

    Args:
        config_path (str): Path to the Qwen3-VL configuration file.
        gradient_checkpointing (bool, optional): Value stored on the config's
            `gradient_checkpointing` attribute. Defaults to False.
    """
    super().__init__()
    config = Qwen3VLConfig.from_json_file(config_path)
    self.config = config
    # Set the flag on the config before the model is constructed from it.
    config.gradient_checkpointing = gradient_checkpointing
    self.model = Qwen3VLForConditionalGeneration(config)
    self.init_weights()

prefix_keys_in_state_dict `class-attribute` `instance-attribute` ¤

# Maps regex patterns to a prefix string; both patterns are paired with "model.".
# NOTE(review): presumably used by the weight-loading code to rewrite
# checkpoint keys so they line up with `self.model` — confirm against the
# base class.
prefix_keys_in_state_dict = {
    # The dots are unescaped, so each one matches any single character:
    # keys starting with "model", one char, "visual", one char.
    "^model.visual.": "model.",
    # Keys starting with "model" that is not immediately followed by
    # ".model"; the trailing dot consumes one more (arbitrary) character.
    "^model(?!\\.model).": "model.",
}

config `instance-attribute` ¤

config = from_json_file(config_path)

model `instance-attribute` ¤

model = Qwen3VLForConditionalGeneration(config)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    image_grid_thw: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Forward pass of the QWen3VLForGeneration model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input token IDs.	required
`pixel_values`	`Tensor`	Image pixel values.	required
`image_grid_thw`	`Tensor`	Image grid temporal/height/width info.	required
`attention_mask`	`Tensor`	Attention mask. Defaults to None.	`None`

Returns:

Type	Description
	torch.Tensor: Output logits.

Source code in src/unitorch/models/qwen/modeling_vl.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    image_grid_thw: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Run the wrapped vision-language model and return its logits.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        pixel_values (torch.Tensor): Image pixel values; flattened to 2-D, keeping the last dimension.
        image_grid_thw (torch.Tensor): Image grid temporal/height/width info; flattened to 2-D,
            keeping the last dimension.
        attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.

    Returns:
        torch.Tensor: The `logits` field of the model output.
    """
    # Collapse all leading dimensions so both image tensors are 2-D.
    flat_grid = image_grid_thw.view(-1, image_grid_thw.size(-1))
    flat_pixels = pixel_values.view(-1, pixel_values.size(-1))

    model_kwargs = dict(
        input_ids=input_ids,
        pixel_values=flat_pixels,
        image_grid_thw=flat_grid,
        attention_mask=attention_mask,
    )
    return self.model(**model_kwargs).logits

generate ¤

generate(
    input_ids: Tensor,
    pixel_values: Tensor,
    image_grid_thw: Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
)

Generates sequences using the QWen3VLForGeneration model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input token IDs.	required
`pixel_values`	`Tensor`	Image pixel values.	required
`image_grid_thw`	`Tensor`	Image grid temporal/height/width info.	required
`num_beams`	`int`	Number of beams for beam search. Defaults to 5.	`5`
`decoder_start_token_id`	`int`	Start token ID. Defaults to 151643.	`151643`
`decoder_end_token_id`	`int or List[int]`	End token ID. Defaults to 151645.	`151645`
`decoder_pad_token_id`	`int`	Pad token ID. Defaults to 151643.	`151643`
`num_return_sequences`	`int`	Number of sequences to return. Defaults to 1.	`1`
`min_gen_seq_length`	`int`	Minimum generated sequence length. Defaults to 0.	`0`
`max_gen_seq_length`	`int`	Maximum generated sequence length. Defaults to 512.	`512`
`repetition_penalty`	`float`	Repetition penalty. Defaults to 1.0.	`1.0`
`no_repeat_ngram_size`	`int`	N-gram size to avoid repeating. Defaults to 0.	`0`
`early_stopping`	`bool`	Whether to stop early. Defaults to True.	`True`
`length_penalty`	`float`	Length penalty. Defaults to 1.0.	`1.0`
`num_beam_groups`	`int`	Number of beam groups. Defaults to 1.	`1`
`diversity_penalty`	`float`	Diversity penalty. Defaults to 0.0.	`0.0`
`do_sample`	`bool`	Whether to use sampling. Defaults to False.	`False`
`temperature`	`float`	Sampling temperature. Defaults to 1.0.	`1.0`
`top_k`	`int`	Top-k sampling. Defaults to 50.	`50`
`top_p`	`float`	Top-p (nucleus) sampling. Defaults to 1.0.	`1.0`

Returns:

Name	Type	Description
`GenericOutputs`		Generated sequences and their scores.

Source code in src/unitorch/models/qwen/modeling_vl.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    image_grid_thw: torch.Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
):
    """
    Generates sequences using the QWen3VLForGeneration model.

    The prompt tokens are stripped from the result, and each generated sequence is
    right-padded with `decoder_start_token_id` up to `max_gen_seq_length`.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        pixel_values (torch.Tensor): Image pixel values; flattened to 2-D, keeping the last dimension.
        image_grid_thw (torch.Tensor): Image grid temporal/height/width info; flattened to 2-D,
            keeping the last dimension.
        num_beams (int, optional): Number of beams for beam search. Defaults to 5.
        decoder_start_token_id (int, optional): Start token ID, also used as the fill value
            for the padded output. Defaults to 151643.
        decoder_end_token_id (int or List[int], optional): End token ID. Defaults to 151645.
        decoder_pad_token_id (int, optional): Pad token ID. Defaults to 151643.
        num_return_sequences (int, optional): Number of sequences to return. Defaults to 1.
        min_gen_seq_length (int, optional): Minimum generated sequence length. Defaults to 0.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
        repetition_penalty (float, optional): Repetition penalty. Defaults to 1.0.
        no_repeat_ngram_size (int, optional): N-gram size to avoid repeating. Defaults to 0.
        early_stopping (bool, optional): Whether to stop early. Defaults to True.
        length_penalty (float, optional): Length penalty. Defaults to 1.0.
        num_beam_groups (int, optional): Number of beam groups. Defaults to 1.
        diversity_penalty (float, optional): Diversity penalty. Defaults to 0.0.
        do_sample (bool, optional): Whether to use sampling. Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 1.0.
        top_k (int, optional): Top-k sampling. Defaults to 50.
        top_p (float, optional): Top-p (nucleus) sampling. Defaults to 1.0.

    Returns:
        GenericOutputs: `sequences` of shape (batch, max_gen_seq_length) when
            `num_return_sequences == 1`, otherwise
            (batch, num_return_sequences, max_gen_seq_length); and `sequences_scores`,
            which is `None` when the decoding strategy does not report them.
    """
    input_seq_length = input_ids.size(1)
    # Collapse all leading dimensions so both image tensors are 2-D.
    image_grid_thw = image_grid_thw.view(-1, image_grid_thw.size(-1))
    pixel_values = pixel_values.view(-1, pixel_values.size(-1))

    # max/min lengths passed to the model count the prompt as well, so the
    # prompt length is added to the generation-only limits.
    outputs = self.model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        image_grid_thw=image_grid_thw,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        pad_token_id=decoder_pad_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Group the returned sequences per input example.
    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )
    # Fixed-width buffer; the generated part (everything after the prompt) is
    # copied into its left side.
    padded = torch.full(
        (sequences.size(0), num_return_sequences, max_gen_seq_length),
        fill_value=decoder_start_token_id,
        device=sequences.device,
    )
    padded[:, :, : sequences.size(-1) - input_seq_length].copy_(
        sequences[:, :, input_seq_length : sequences.size(-1)]
    )

    if num_return_sequences == 1:
        padded = padded.reshape(-1, max_gen_seq_length)

    # `sequences_scores` is only present on beam-search outputs; greedy or
    # sampling runs (num_beams=1) return an output object without it, which
    # previously raised AttributeError here.
    sequences_scores = getattr(outputs, "sequences_scores", None)

    return GenericOutputs(
        sequences=padded.long(),
        sequences_scores=sequences_scores,
    )

unitorch.models.qwen¤

QWenProcessor¤

QWenVLProcessor¤

image_token instance-attribute ¤

video_token instance-attribute ¤

image_token_id instance-attribute ¤

video_token_id instance-attribute ¤

vision_processor instance-attribute ¤

processing_images ¤

classification ¤

generation_inputs ¤

generation ¤

messages_generation ¤

QWen3ForGeneration¤

prefix_keys_in_state_dict class-attribute instance-attribute ¤

config instance-attribute ¤

model instance-attribute ¤

forward ¤

generate ¤

QWen3VLForGeneration¤

prefix_keys_in_state_dict class-attribute instance-attribute ¤

config instance-attribute ¤

model instance-attribute ¤

forward ¤

generate ¤

image_token `instance-attribute` ¤

video_token `instance-attribute` ¤

image_token_id `instance-attribute` ¤

video_token_id `instance-attribute` ¤

vision_processor `instance-attribute` ¤

prefix_keys_in_state_dict `class-attribute` `instance-attribute` ¤

config `instance-attribute` ¤

model `instance-attribute` ¤

prefix_keys_in_state_dict `class-attribute` `instance-attribute` ¤

config `instance-attribute` ¤

model `instance-attribute` ¤