Skip to content

unitorch.models.qwen¤

QWenProcessor¤

Bases: HfLlmProcessor

Initializes the QWenProcessor.

Parameters:

Name Type Description Default
tokenizer_file str

Path to the tokenizer file.

required
tokenizer_config str

Path to the tokenizer config JSON file.

None
special_tokens_map str

Path to the special tokens map JSON file.

None
chat_template str

Path to the chat template JSON file.

None
max_seq_length int

Maximum sequence length. Defaults to 12800.

12800
max_gen_seq_length int

Maximum generated sequence length. Defaults to 512.

512
Source code in src/unitorch/models/qwen/processing.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def __init__(
    self,
    tokenizer_file: str,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    chat_template: Optional[str] = None,
    max_seq_length: Optional[int] = 12800,
    max_gen_seq_length: Optional[int] = 512,
):
    """
    Build a Qwen2 fast tokenizer from on-disk files and hand it to the base processor.

    Args:
        tokenizer_file (str): Path to the tokenizer file passed to ``Qwen2TokenizerFast``.
        tokenizer_config (str, optional): Path to a JSON file. Its entries (minus
            ``added_tokens_decoder``) are forwarded as keyword arguments to the tokenizer.
        special_tokens_map (str, optional): Path to a JSON file mapping special-token
            names to token specs; each spec is converted with ``get_added_token``.
        chat_template (str, optional): Path to a JSON file holding a ``chat_template`` key.
        max_seq_length (int, optional): Maximum sequence length. Defaults to 12800.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
    """
    # Load the optional JSON side files; a missing path means "no overrides".
    config_kwargs = {}
    if tokenizer_config:
        config_kwargs = read_json_file(tokenizer_config)
    token_map = {}
    if special_tokens_map:
        token_map = read_json_file(special_tokens_map)

    # The added-token table is handled separately, not passed to the constructor.
    decoder_specs = config_kwargs.pop("added_tokens_decoder", {})

    # Entries serialized as AddedToken dicts are rebuilt into token objects.
    init_kwargs = {}
    for key, value in config_kwargs.items():
        if isinstance(value, dict) and value.get("__type") == "AddedToken":
            value = get_added_token(value)
        init_kwargs[key] = value

    tokenizer = Qwen2TokenizerFast(
        tokenizer_file=tokenizer_file,
        **init_kwargs,
    )

    # NOTE(review): these writes go into whatever mappings the tokenizer returns
    # for added_tokens_decoder / added_tokens_encoder — confirm they are the live
    # tables and not copies. Keys are used exactly as they appear in the JSON.
    for token_id, token_spec in decoder_specs.items():
        content = token_spec["content"]
        tokenizer.added_tokens_decoder[token_id] = get_added_token(token_spec)
        tokenizer.added_tokens_encoder[content] = token_id

    special_tokens = {}
    for name, token_spec in token_map.items():
        special_tokens[name] = get_added_token(token_spec)
    tokenizer.add_special_tokens(special_tokens)

    if chat_template:
        template_payload = read_json_file(chat_template)
        tokenizer.chat_template = template_payload["chat_template"]

    # Alias cls/sep to bos/eos.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

QWenVLProcessor¤

Bases: HfLlmProcessor

Initializes the QWenVLProcessor.

Parameters:

Name Type Description Default
tokenizer_file str

Path to the tokenizer file.

required
vision_config_path str

Path to the JSON configuration file for the image processor.

required
max_seq_length int

The maximum sequence length for text inputs. Defaults to 1280.

1280
Source code in src/unitorch/models/qwen/processing_vl.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def __init__(
    self,
    tokenizer_file: str,
    vision_config_path: str,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    chat_template: Optional[str] = None,
    max_seq_length: Optional[int] = 1280,
    max_gen_seq_length: Optional[int] = 512,
):
    """
    Initializes the QWenVLProcessor.

    Args:
        tokenizer_file (str): Path to the tokenizer file passed to ``Qwen2TokenizerFast``.
        vision_config_path (str): Path to the JSON config loaded by
            ``Qwen2VLImageProcessor.from_json_file``.
        tokenizer_config (str, optional): Path to a JSON file. Its entries (minus
            ``added_tokens_decoder``) are forwarded as keyword arguments to the tokenizer.
        special_tokens_map (str, optional): Path to a JSON file mapping special-token
            names to token specs. Dict and string specs are registered; any other
            value type (e.g. a list) is skipped.
        chat_template (str, optional): Path to a JSON file holding a ``chat_template`` key.
        max_seq_length (int, optional): Maximum sequence length. Defaults to 1280.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
    """
    tokenizer_config = read_json_file(tokenizer_config) if tokenizer_config else {}
    special_tokens_map = (
        read_json_file(special_tokens_map) if special_tokens_map else {}
    )
    # The added-token table is handled separately, not passed to the constructor.
    added_tokens_decoder = tokenizer_config.pop("added_tokens_decoder", {})
    # Entries serialized as AddedToken dicts are rebuilt into token objects.
    tokenizer_config = {
        k: (
            get_added_token(v)
            if isinstance(v, dict) and v.get("__type") == "AddedToken"
            else v
        )
        for k, v in tokenizer_config.items()
    }
    tokenizer = Qwen2TokenizerFast(
        tokenizer_file=tokenizer_file,
        **tokenizer_config,
    )
    for idx, spec in added_tokens_decoder.items():
        token = spec["content"]
        tokenizer.added_tokens_decoder[idx] = get_added_token(spec)
        tokenizer.added_tokens_encoder[token] = idx

    # The previous check `isinstance(spec, dict or str)` evaluated to
    # `isinstance(spec, dict)` and silently dropped string specs.
    special_tokens = {}
    for name, spec in special_tokens_map.items():
        if isinstance(spec, dict):
            special_tokens[name] = get_added_token(spec)
        elif isinstance(spec, str):
            # Plain strings are accepted by add_special_tokens as-is.
            special_tokens[name] = spec
    tokenizer.add_special_tokens(special_tokens)
    if chat_template:
        tokenizer.chat_template = read_json_file(chat_template)["chat_template"]
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    # Prefer the tokenizer's own image/video tokens, else fall back to the literals.
    self.image_token = getattr(tokenizer, "image_token", "<|image_pad|>")
    self.video_token = getattr(tokenizer, "video_token", "<|video_pad|>")
    # Use the tokenizer's id attribute when present and truthy; otherwise look it up.
    self.image_token_id = (
        tokenizer.image_token_id
        if getattr(tokenizer, "image_token_id", None)
        else tokenizer.convert_tokens_to_ids(self.image_token)
    )
    self.video_token_id = (
        tokenizer.video_token_id
        if getattr(tokenizer, "video_token_id", None)
        else tokenizer.convert_tokens_to_ids(self.video_token)
    )
    self.vision_processor = Qwen2VLImageProcessor.from_json_file(vision_config_path)

    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

image_token instance-attribute ¤

image_token = (
    "<|image_pad|>"
    if not hasattr(tokenizer, "image_token")
    else image_token
)

video_token instance-attribute ¤

video_token = (
    "<|video_pad|>"
    if not hasattr(tokenizer, "video_token")
    else video_token
)

image_token_id instance-attribute ¤

image_token_id = (
    image_token_id
    if getattr(tokenizer, "image_token_id", None)
    else convert_tokens_to_ids(image_token)
)

video_token_id instance-attribute ¤

video_token_id = (
    video_token_id
    if getattr(tokenizer, "video_token_id", None)
    else convert_tokens_to_ids(video_token)
)

vision_processor instance-attribute ¤

vision_processor = from_json_file(vision_config_path)

processing_images ¤

processing_images(
    images: Union[Image, str, List[Image], List[str]],
)

Process images for classification.

Parameters:

Name Type Description Default
images (Image, str, List[Image], List[str])

Input image or list of images.

required

Returns:

Name Type Description
GenericOutputs

Processed outputs.

Source code in src/unitorch/models/qwen/processing_vl.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def processing_images(
    self,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
):
    """
    Run the vision processor over one or more images.

    Args:
        images (Image.Image, str, List[Image.Image], List[str]): A single image or a
            collection of images. Items that are not ``Image.Image`` instances are
            opened with ``Image.open``.

    Returns:
        The result of calling ``self.vision_processor`` with ``return_tensors="pt"``.
    """
    # Normalize a single image or path into a one-element list.
    candidates = [images] if isinstance(images, (Image.Image, str)) else images

    loaded = []
    for item in candidates:
        if not isinstance(item, Image.Image):
            item = Image.open(item)
        loaded.append(item)

    return self.vision_processor(images=loaded, return_tensors="pt")

classification ¤

classification(
    text: str,
    images: Union[Image, str, List[Image], List[str]],
    max_seq_length: Optional[int] = None,
)
Source code in src/unitorch/models/qwen/processing_vl.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def classification(
    self,
    text: str,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    max_seq_length: Optional[int] = None,
):
    """
    Build classification inputs from text and (optionally) images.

    Each occurrence of ``self.image_token`` in ``text`` is expanded to as many
    image tokens as the corresponding image's grid requires
    (``prod(grid_thw) // merge_size**2``) before tokenization.

    Args:
        text (str): Input text, possibly containing image-token placeholders.
        images (Image.Image, str, List[Image.Image], List[str]): Image(s) matching the
            placeholders in ``text``. A falsy value means no images.
        max_seq_length (int, optional): Forwarded to the base ``classification``.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask``, ``image_grid_thw`` and
        ``pixel_values``. The last two are ``None`` when no images are given.
    """
    # Coerce up front so the membership test below also works on non-str input.
    text = str(text)
    # Skip image processing when no images are supplied, as `generation` does.
    image_inputs = self.processing_images(images) if images else None
    image_index, image_merge_size = 0, self.vision_processor.merge_size**2
    image_grid_thw = (
        image_inputs["image_grid_thw"] if image_inputs is not None else None
    )
    # Expand via a temporary marker so already-expanded tokens are not matched
    # again by the `while` condition.
    while self.image_token in text:
        num_image_tokens = image_grid_thw[image_index].prod() // image_merge_size
        text = text.replace(
            self.image_token,
            "<|placeholder|>" * num_image_tokens,
            1,
        )
        image_index += 1
    text = text.replace("<|placeholder|>", self.image_token)
    text_inputs = super().classification(text, max_seq_length=max_seq_length)

    return GenericOutputs(
        input_ids=torch.tensor(text_inputs.input_ids, dtype=torch.long),
        attention_mask=torch.tensor(text_inputs.attention_mask, dtype=torch.long),
        image_grid_thw=(
            torch.tensor(image_grid_thw, dtype=torch.long)
            if image_grid_thw is not None
            else None
        ),
        pixel_values=(
            torch.tensor(image_inputs["pixel_values"])
            if image_inputs is not None
            else None
        ),
    )

generation_inputs ¤

generation_inputs(
    text: str,
    images: Union[Image, str, List[Image], List[str]],
    max_seq_length: Optional[int] = None,
)
Source code in src/unitorch/models/qwen/processing_vl.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
def generation_inputs(
    self,
    text: str,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    max_seq_length: Optional[int] = None,
):
    """
    Build prompt-side inputs for generation from text and (optionally) images.

    Each occurrence of ``self.image_token`` in ``text`` is expanded to as many
    image tokens as the corresponding image's grid requires
    (``prod(grid_thw) // merge_size**2``) before tokenization.

    Args:
        text (str): Prompt text, possibly containing image-token placeholders.
        images (Image.Image, str, List[Image.Image], List[str]): Image(s) matching the
            placeholders in ``text``. A falsy value means no images.
        max_seq_length (int, optional): Forwarded to the base ``classification`` call
            used for tokenization.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask``, ``image_grid_thw`` and
        ``pixel_values``. The last two are ``None`` when no images are given.
    """
    # Coerce up front so the membership test below also works on non-str input.
    text = str(text)
    image_inputs = self.processing_images(images) if images else None
    image_index, image_merge_size = 0, self.vision_processor.merge_size**2
    image_grid_thw = (
        image_inputs["image_grid_thw"] if image_inputs is not None else None
    )
    # Expand via a temporary marker so already-expanded tokens are not matched
    # again by the `while` condition.
    while self.image_token in text:
        num_image_tokens = image_grid_thw[image_index].prod() // image_merge_size
        text = text.replace(
            self.image_token,
            "<|placeholder|>" * num_image_tokens,
            1,
        )
        image_index += 1
    text = text.replace("<|placeholder|>", self.image_token)
    text_inputs = super().classification(text, max_seq_length=max_seq_length)
    # Without images the image fields are None; previously this path raised
    # TypeError on torch.tensor(None) / None["pixel_values"].
    return GenericOutputs(
        input_ids=torch.tensor(text_inputs.input_ids, dtype=torch.long),
        attention_mask=torch.tensor(text_inputs.attention_mask, dtype=torch.long),
        image_grid_thw=(
            torch.tensor(image_grid_thw, dtype=torch.long)
            if image_grid_thw is not None
            else None
        ),
        pixel_values=(
            torch.tensor(image_inputs["pixel_values"])
            if image_inputs is not None
            else None
        ),
    )

generation ¤

generation(
    text: str,
    images: Union[Image, str, List[Image], List[str]],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
)
Source code in src/unitorch/models/qwen/processing_vl.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def generation(
    self,
    text: str,
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    text_pair: str,
    max_seq_length: Optional[int] = None,
    max_gen_seq_length: Optional[int] = None,
):
    """
    Build prompt/target inputs for generation training from text and optional images.

    Each occurrence of ``self.image_token`` in ``text`` is expanded to
    ``prod(grid_thw) // merge_size**2`` image tokens for the matching image.

    Args:
        text (str): Prompt text, possibly containing image-token placeholders.
        images (Image.Image, str, List[Image.Image], List[str]): Image(s) matching the
            placeholders in ``text``. A falsy value means no images.
        text_pair (str): Target text forwarded to the base ``generation``.
        max_seq_length (int, optional): Forwarded to the base ``generation``.
        max_gen_seq_length (int, optional): Forwarded to the base ``generation``.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask``, ``image_grid_thw``,
        ``pixel_values``, ``input_ids_label`` and ``attention_mask_label``. The two
        image fields are ``None`` when no images are given.
    """
    text = str(text)
    text_pair = str(text_pair)

    vision_inputs = self.processing_images(images) if images else None
    merge_area = self.vision_processor.merge_size**2
    grid_thw = vision_inputs["image_grid_thw"] if vision_inputs else None

    # Swap one image token at a time for a run of temporary markers, then turn
    # every marker back into an image token once all images are accounted for.
    cursor = 0
    while self.image_token in text:
        repeat = grid_thw[cursor].prod() // merge_area
        text = text.replace(self.image_token, "<|placeholder|>" * repeat, 1)
        cursor += 1
    text = text.replace("<|placeholder|>", self.image_token)

    encoded = super().generation(
        text,
        text_pair=text_pair,
        max_seq_length=max_seq_length,
        max_gen_seq_length=max_gen_seq_length,
    )

    input_ids = torch.tensor(encoded.input_ids, dtype=torch.long)
    attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)

    grid_tensor = None
    if grid_thw is not None:
        grid_tensor = torch.tensor(grid_thw, dtype=torch.long)

    pixel_tensor = None
    if vision_inputs is not None:
        pixel_tensor = torch.tensor(vision_inputs["pixel_values"])

    label_ids = torch.tensor(encoded.input_ids_label, dtype=torch.long)
    label_mask = torch.tensor(encoded.attention_mask_label, dtype=torch.long)

    return GenericOutputs(
        input_ids=input_ids,
        attention_mask=attention_mask,
        image_grid_thw=grid_tensor,
        pixel_values=pixel_tensor,
        input_ids_label=label_ids,
        attention_mask_label=label_mask,
    )

messages_generation ¤

messages_generation(
    messages: List[Dict[str, Any]],
    images: Union[Image, str, List[Image], List[str]],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs

Preprocesses messages for generation.

Parameters:

Name Type Description Default
messages List[Dict[str, Any]]

The list of messages to process.

required
max_seq_length Optional[int]

The maximum sequence length. Defaults to None.

None

Returns:

Name Type Description
GenericOutputs GenericOutputs

The processed input IDs tensor.

Source code in src/unitorch/models/qwen/processing_vl.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def messages_generation(
    self,
    messages: List[Dict[str, Any]],
    images: Union[Image.Image, str, List[Image.Image], List[str]],
    max_seq_length: Optional[int] = None,
) -> GenericOutputs:
    """
    Preprocesses chat messages (and optional images) for generation.

    Trailing messages after the last ``assistant`` message are ignored. The final
    assistant message becomes the target; everything before it becomes the prompt.

    Args:
        messages (List[Dict[str, Any]]): The list of messages to process. Each
            message must have a ``"role"`` key. The list is not modified.
        images (Image.Image, str, List[Image.Image], List[str]): Image(s) forwarded
            to ``generation``.
        max_seq_length (Optional[int]): The maximum sequence length. Defaults to None.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask``, ``image_grid_thw``,
        ``pixel_values``, ``input_ids_label`` and ``attention_mask_label`` as
        produced by ``generation``.
    """
    # Trim a shallow copy so the caller's list is left untouched.
    messages = list(messages)
    while messages and messages[-1]["role"] != "assistant":
        messages.pop()

    text = self.chat_template(messages[:-1])
    text_pair = self.chat_template(messages[-1:])
    outputs = self.generation(
        text=text,
        images=images,
        text_pair=text_pair,
        max_seq_length=max_seq_length,
    )
    return GenericOutputs(
        input_ids=outputs.input_ids,
        attention_mask=outputs.attention_mask,
        image_grid_thw=outputs.image_grid_thw,
        pixel_values=outputs.pixel_values,
        input_ids_label=outputs.input_ids_label,
        attention_mask_label=outputs.attention_mask_label,
    )

QWen3ForGeneration¤

Bases: GenericModel, PeftWeightLoaderMixin

Qwen3 model for text generation tasks.

Initializes the QWen3ForGeneration model.

Parameters:

Name Type Description Default
config_path str

Path to the Qwen3 configuration file.

required
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/qwen/modeling.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def __init__(
    self,
    config_path: str,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Construct the Qwen3 causal LM from a JSON configuration file.

    Args:
        config_path (str): Path to the Qwen3 configuration file.
        gradient_checkpointing (bool, optional): Value stored on the config's
            ``gradient_checkpointing`` attribute. Defaults to False.
    """
    super().__init__()

    # Load and adjust the config before it is shared with the model.
    config = Qwen3Config.from_json_file(config_path)
    config.gradient_checkpointing = gradient_checkpointing

    self.config = config
    self.model = Qwen3ForCausalLM(config)
    self.init_weights()

prefix_keys_in_state_dict class-attribute instance-attribute ¤

prefix_keys_in_state_dict = {
    "^(?!model\\.model\\.).*": "model."
}

config instance-attribute ¤

config = from_json_file(config_path)

model instance-attribute ¤

model = Qwen3ForCausalLM(config)

forward ¤

forward(
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
) -> Tensor

Forward pass of the QWen3ForGeneration model.

Parameters:

Name Type Description Default
input_ids Tensor

Input token IDs.

required
attention_mask Tensor

Attention mask. Defaults to None.

None

Returns:

Type Description
Tensor

torch.Tensor: Output logits.

Source code in src/unitorch/models/qwen/modeling.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def forward(
    self,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Run the wrapped causal LM and return its logits.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.

    Returns:
        torch.Tensor: The ``logits`` attribute of the model output.
    """
    # return_dict=True so the logits can be read by attribute.
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        return_dict=True,
    ).logits

generate ¤

generate(
    input_ids: Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
) -> GenericOutputs

Generates sequences using the QWen3ForGeneration model.

Parameters:

Name Type Description Default
input_ids Tensor

Input token IDs.

required
num_beams int

Number of beams for beam search. Defaults to 5.

5
decoder_start_token_id int

Start token ID. Defaults to 151643.

151643
decoder_end_token_id int or List[int]

End token ID. Defaults to 151645.

151645
decoder_pad_token_id int

Pad token ID. Defaults to 151643.

151643
num_return_sequences int

Number of sequences to return. Defaults to 1.

1
min_gen_seq_length int

Minimum generated sequence length. Defaults to 0.

0
max_gen_seq_length int

Maximum generated sequence length. Defaults to 512.

512
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
no_repeat_ngram_size int

N-gram size to avoid repeating. Defaults to 0.

0
early_stopping bool

Whether to stop early. Defaults to True.

True
length_penalty float

Length penalty. Defaults to 1.0.

1.0
num_beam_groups int

Number of beam groups. Defaults to 1.

1
diversity_penalty float

Diversity penalty. Defaults to 0.0.

0.0
do_sample bool

Whether to use sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p (nucleus) sampling. Defaults to 1.0.

1.0

Returns:

Name Type Description
GenericOutputs GenericOutputs

Generated sequences and their scores.

Source code in src/unitorch/models/qwen/modeling.py
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
) -> GenericOutputs:
    """
    Generates sequences using the QWen3ForGeneration model.

    The prompt tokens are stripped from the result and the generated part is
    right-padded to ``max_gen_seq_length`` with ``decoder_start_token_id``.

    Args:
        input_ids (torch.Tensor): Input token IDs; dimension 1 is used as the prompt length.
        num_beams (int, optional): Number of beams for beam search. Defaults to 5.
        decoder_start_token_id (int, optional): Start token ID, also used as the
            fill value for the padded output. Defaults to 151643.
        decoder_end_token_id (int or List[int], optional): End token ID. Defaults to 151645.
        decoder_pad_token_id (int, optional): Pad token ID passed to the underlying
            ``generate``. Defaults to 151643.
        num_return_sequences (int, optional): Number of sequences to return. Defaults to 1.
        min_gen_seq_length (int, optional): Minimum generated sequence length. Defaults to 0.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
        repetition_penalty (float, optional): Repetition penalty. Defaults to 1.0.
        no_repeat_ngram_size (int, optional): N-gram size to avoid repeating. Defaults to 0.
        early_stopping (bool, optional): Whether to stop early. Defaults to True.
        length_penalty (float, optional): Length penalty. Defaults to 1.0.
        num_beam_groups (int, optional): Number of beam groups. Defaults to 1.
        diversity_penalty (float, optional): Diversity penalty. Defaults to 0.0.
        do_sample (bool, optional): Whether to use sampling. Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 1.0.
        top_k (int, optional): Top-k sampling. Defaults to 50.
        top_p (float, optional): Top-p (nucleus) sampling. Defaults to 1.0.

    Returns:
        GenericOutputs: ``sequences`` of shape ``(batch, max_gen_seq_length)`` when
        ``num_return_sequences == 1``, otherwise
        ``(batch, num_return_sequences, max_gen_seq_length)``, and
        ``sequences_scores`` (``None`` when the underlying output does not
        provide them, e.g. without beam search).
    """
    input_seq_length = input_ids.size(1)
    # Length limits are expressed in total tokens, so add the prompt length.
    outputs = self.model.generate(
        input_ids,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        pad_token_id=decoder_pad_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Group the returned rows per input example.
    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )
    # NOTE(review): padding uses decoder_start_token_id rather than
    # decoder_pad_token_id (identical by default) — confirm this is intended.
    padded = torch.full(
        (sequences.size(0), num_return_sequences, max_gen_seq_length),
        fill_value=decoder_start_token_id,
        device=sequences.device,
    )
    # Copy only the newly generated tokens (everything after the prompt).
    padded[:, :, : sequences.size(-1) - input_seq_length].copy_(
        sequences[:, :, input_seq_length : sequences.size(-1)]
    )

    if num_return_sequences == 1:
        padded = padded.reshape(-1, max_gen_seq_length)

    return GenericOutputs(
        sequences=padded.long(),
        # Only beam-search outputs carry sequences_scores; fall back to None
        # instead of raising AttributeError for greedy/sampling runs.
        sequences_scores=getattr(outputs, "sequences_scores", None),
    )

QWen3VLForGeneration¤

Bases: GenericModel, PeftWeightLoaderMixin

Qwen3-VL model for vision-language text generation tasks.

Initializes the QWen3VLForGeneration model.

Parameters:

Name Type Description Default
config_path str

Path to the Qwen3-VL configuration file.

required
gradient_checkpointing bool

Whether to use gradient checkpointing. Defaults to False.

False
Source code in src/unitorch/models/qwen/modeling_vl.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def __init__(
    self,
    config_path: str,
    gradient_checkpointing: Optional[bool] = False,
):
    """
    Construct the Qwen3-VL conditional-generation model from a JSON configuration file.

    Args:
        config_path (str): Path to the Qwen3-VL configuration file.
        gradient_checkpointing (bool, optional): Value stored on the config's
            ``gradient_checkpointing`` attribute. Defaults to False.
    """
    super().__init__()

    # Load and adjust the config before it is shared with the model.
    config = Qwen3VLConfig.from_json_file(config_path)
    config.gradient_checkpointing = gradient_checkpointing

    self.config = config
    self.model = Qwen3VLForConditionalGeneration(config)
    self.init_weights()

prefix_keys_in_state_dict class-attribute instance-attribute ¤

prefix_keys_in_state_dict = {
    "^model.visual.": "model.",
    "^model(?!\\.model).": "model.",
}

config instance-attribute ¤

config = from_json_file(config_path)

model instance-attribute ¤

model = Qwen3VLForConditionalGeneration(config)

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    image_grid_thw: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Forward pass of the QWen3VLForGeneration model.

Parameters:

Name Type Description Default
input_ids Tensor

Input token IDs.

required
pixel_values Tensor

Image pixel values.

required
image_grid_thw Tensor

Image grid temporal/height/width info.

required
attention_mask Tensor

Attention mask. Defaults to None.

None

Returns:

Type Description

torch.Tensor: Output logits.

Source code in src/unitorch/models/qwen/modeling_vl.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    image_grid_thw: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Run the wrapped vision-language model and return its logits.

    Args:
        input_ids (torch.Tensor): Input token IDs.
        pixel_values (torch.Tensor): Image pixel values; flattened to 2-D keeping
            the last dimension.
        image_grid_thw (torch.Tensor): Image grid temporal/height/width info;
            flattened to 2-D keeping the last dimension.
        attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.

    Returns:
        torch.Tensor: The ``logits`` attribute of the model output.
    """
    # Collapse all leading dimensions so both image tensors are 2-D.
    flat_grid = image_grid_thw.view(-1, image_grid_thw.shape[-1])
    flat_pixels = pixel_values.view(-1, pixel_values.shape[-1])

    return self.model(
        input_ids=input_ids,
        pixel_values=flat_pixels,
        image_grid_thw=flat_grid,
        attention_mask=attention_mask,
    ).logits

generate ¤

generate(
    input_ids: Tensor,
    pixel_values: Tensor,
    image_grid_thw: Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[
        Union[int, List[int]]
    ] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
)

Generates sequences using the QWen3VLForGeneration model.

Parameters:

Name Type Description Default
input_ids Tensor

Input token IDs.

required
pixel_values Tensor

Image pixel values.

required
image_grid_thw Tensor

Image grid temporal/height/width info.

required
num_beams int

Number of beams for beam search. Defaults to 5.

5
decoder_start_token_id int

Start token ID. Defaults to 151643.

151643
decoder_end_token_id int or List[int]

End token ID. Defaults to 151645.

151645
decoder_pad_token_id int

Pad token ID. Defaults to 151643.

151643
num_return_sequences int

Number of sequences to return. Defaults to 1.

1
min_gen_seq_length int

Minimum generated sequence length. Defaults to 0.

0
max_gen_seq_length int

Maximum generated sequence length. Defaults to 512.

512
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
no_repeat_ngram_size int

N-gram size to avoid repeating. Defaults to 0.

0
early_stopping bool

Whether to stop early. Defaults to True.

True
length_penalty float

Length penalty. Defaults to 1.0.

1.0
num_beam_groups int

Number of beam groups. Defaults to 1.

1
diversity_penalty float

Diversity penalty. Defaults to 0.0.

0.0
do_sample bool

Whether to use sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p (nucleus) sampling. Defaults to 1.0.

1.0

Returns:

Name Type Description
GenericOutputs

Generated sequences and their scores.

Source code in src/unitorch/models/qwen/modeling_vl.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    image_grid_thw: torch.Tensor,
    num_beams: Optional[int] = 5,
    decoder_start_token_id: Optional[int] = 151643,
    decoder_end_token_id: Optional[Union[int, List[int]]] = 151645,
    decoder_pad_token_id: Optional[int] = 151643,
    num_return_sequences: Optional[int] = 1,
    min_gen_seq_length: Optional[int] = 0,
    max_gen_seq_length: Optional[int] = 512,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 0,
    early_stopping: Optional[bool] = True,
    length_penalty: Optional[float] = 1.0,
    num_beam_groups: Optional[int] = 1,
    diversity_penalty: Optional[float] = 0.0,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
):
    """
    Generates sequences using the QWen3VLForGeneration model.

    The prompt tokens are stripped from the result and the generated part is
    right-padded to ``max_gen_seq_length`` with ``decoder_start_token_id``.

    Args:
        input_ids (torch.Tensor): Input token IDs; dimension 1 is used as the prompt length.
        pixel_values (torch.Tensor): Image pixel values; flattened to 2-D keeping
            the last dimension.
        image_grid_thw (torch.Tensor): Image grid temporal/height/width info;
            flattened to 2-D keeping the last dimension.
        num_beams (int, optional): Number of beams for beam search. Defaults to 5.
        decoder_start_token_id (int, optional): Start token ID, also used as the
            fill value for the padded output. Defaults to 151643.
        decoder_end_token_id (int or List[int], optional): End token ID. Defaults to 151645.
        decoder_pad_token_id (int, optional): Pad token ID passed to the underlying
            ``generate``. Defaults to 151643.
        num_return_sequences (int, optional): Number of sequences to return. Defaults to 1.
        min_gen_seq_length (int, optional): Minimum generated sequence length. Defaults to 0.
        max_gen_seq_length (int, optional): Maximum generated sequence length. Defaults to 512.
        repetition_penalty (float, optional): Repetition penalty. Defaults to 1.0.
        no_repeat_ngram_size (int, optional): N-gram size to avoid repeating. Defaults to 0.
        early_stopping (bool, optional): Whether to stop early. Defaults to True.
        length_penalty (float, optional): Length penalty. Defaults to 1.0.
        num_beam_groups (int, optional): Number of beam groups. Defaults to 1.
        diversity_penalty (float, optional): Diversity penalty. Defaults to 0.0.
        do_sample (bool, optional): Whether to use sampling. Defaults to False.
        temperature (float, optional): Sampling temperature. Defaults to 1.0.
        top_k (int, optional): Top-k sampling. Defaults to 50.
        top_p (float, optional): Top-p (nucleus) sampling. Defaults to 1.0.

    Returns:
        GenericOutputs: ``sequences`` of shape ``(batch, max_gen_seq_length)`` when
        ``num_return_sequences == 1``, otherwise
        ``(batch, num_return_sequences, max_gen_seq_length)``, and
        ``sequences_scores`` (``None`` when the underlying output does not
        provide them, e.g. without beam search).
    """
    input_seq_length = input_ids.size(1)
    # Collapse all leading dimensions so both image tensors are 2-D.
    image_grid_thw = image_grid_thw.view(-1, image_grid_thw.size(-1))
    pixel_values = pixel_values.view(-1, pixel_values.size(-1))

    # Length limits are expressed in total tokens, so add the prompt length.
    outputs = self.model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        image_grid_thw=image_grid_thw,
        max_length=max_gen_seq_length + input_seq_length,
        min_length=min_gen_seq_length + input_seq_length,
        num_beams=num_beams,
        do_sample=do_sample,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        bos_token_id=decoder_start_token_id,
        eos_token_id=decoder_end_token_id,
        pad_token_id=decoder_pad_token_id,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Group the returned rows per input example.
    sequences = outputs.sequences.reshape(
        -1, num_return_sequences, outputs.sequences.size(-1)
    )
    # NOTE(review): padding uses decoder_start_token_id rather than
    # decoder_pad_token_id (identical by default) — confirm this is intended.
    padded = torch.full(
        (sequences.size(0), num_return_sequences, max_gen_seq_length),
        fill_value=decoder_start_token_id,
        device=sequences.device,
    )
    # Copy only the newly generated tokens (everything after the prompt).
    padded[:, :, : sequences.size(-1) - input_seq_length].copy_(
        sequences[:, :, input_seq_length : sequences.size(-1)]
    )

    if num_return_sequences == 1:
        padded = padded.reshape(-1, max_gen_seq_length)

    return GenericOutputs(
        sequences=padded.long(),
        # Only beam-search outputs carry sequences_scores; fall back to None
        # instead of raising AttributeError for greedy/sampling runs.
        sequences_scores=getattr(outputs, "sequences_scores", None),
    )