Skip to content

unitorch.cli.models.vllm¤

QWen3VLLMForGeneration¤

Tip

core/model/vllm/generation/qwen3 is the section for configuration of QWen3VLLMForGeneration.

Bases: VLLMForGeneration

QWen3 text generation model using the vLLM inference engine.

Uses vLLM's offline batch engine for high-throughput inference. Accepts tokenized input_ids tensors and returns GenerationOutputs compatible with unitorch-infer.

Source code in src/unitorch/cli/models/vllm/modeling.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __init__(
    self,
    hf_name_or_folder: str,
    tensor_parallel_size: Optional[int] = 1,
    pipeline_parallel_size: Optional[int] = 1,
    gpu_memory_utilization: Optional[float] = 0.90,
    max_model_len: Optional[int] = None,
    max_num_seqs: Optional[int] = 256,
    enable_prefix_caching: Optional[bool] = True,
    dtype: Optional[str] = "auto",
    enforce_eager: Optional[bool] = False,
    quantization: Optional[str] = None,
):
    """
    Builds the QWen3 vLLM text generation model.

    This initializer adds no logic of its own: every argument is handed to the
    parent initializer by keyword, unchanged. The meaning of each option is
    defined by the parent class (presumably vLLM engine settings — confirm
    against the base class).

    Args:
        hf_name_or_folder (str): Forwarded unchanged as ``hf_name_or_folder``.
        tensor_parallel_size (int): Forwarded unchanged. Defaults to 1.
        pipeline_parallel_size (int): Forwarded unchanged. Defaults to 1.
        gpu_memory_utilization (float): Forwarded unchanged. Defaults to 0.90.
        max_model_len (int, optional): Forwarded unchanged. Defaults to None.
        max_num_seqs (int): Forwarded unchanged. Defaults to 256.
        enable_prefix_caching (bool): Forwarded unchanged. Defaults to True.
        dtype (str): Forwarded unchanged. Defaults to ``"auto"``.
        enforce_eager (bool): Forwarded unchanged. Defaults to False.
        quantization (str, optional): Forwarded unchanged. Defaults to None.
    """
    # Collect the options once so the delegation below is a single call.
    engine_options = dict(
        hf_name_or_folder=hf_name_or_folder,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        enable_prefix_caching=enable_prefix_caching,
        dtype=dtype,
        enforce_eager=enforce_eager,
        quantization=quantization,
    )
    super().__init__(**engine_options)

from_config classmethod ¤

from_config(config, **kwargs)
Source code in src/unitorch/cli/models/vllm/modeling.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@classmethod
@config_defaults_init("core/model/vllm/generation/qwen3")
def from_config(cls, config, **kwargs):
    """
    Creates an instance from the ``core/model/vllm/generation/qwen3`` config section.

    The checkpoint is resolved with ``pop_value`` from two candidates, in this
    order: the ``hf_name_or_folder`` option, and the ``hf_pretrained_name``
    entry that ``pretrained_vllm_infos`` holds for ``pretrained_name``
    (default ``"qwen3-4b-thinking"``). All remaining constructor options are
    read from the same section with the fallbacks listed in the body.

    Args:
        config: Configuration object providing ``set_default_section`` and
            ``getoption``.
        **kwargs: Accepted but not used inside this method body.

    Returns:
        An instance of ``cls`` built from the resolved options.
    """
    config.set_default_section("core/model/vllm/generation/qwen3")
    preset_name = config.getoption("pretrained_name", "qwen3-4b-thinking")

    # Arguments are evaluated left to right: the explicit option is read first,
    # then the preset table is consulted.
    checkpoint = pop_value(
        config.getoption("hf_name_or_folder", None),
        nested_dict_value(pretrained_vllm_infos, preset_name, "hf_pretrained_name"),
    )

    # (option name, fallback) pairs, read in this exact order.
    option_fallbacks = (
        ("tensor_parallel_size", 1),
        ("pipeline_parallel_size", 1),
        ("gpu_memory_utilization", 0.90),
        ("max_model_len", None),
        ("max_num_seqs", 256),
        ("enable_prefix_caching", True),
        ("dtype", "auto"),
        ("enforce_eager", False),
        ("quantization", None),
    )
    engine_options = {}
    for option_name, fallback in option_fallbacks:
        engine_options[option_name] = config.getoption(option_name, fallback)

    return cls(hf_name_or_folder=checkpoint, **engine_options)

__call__ ¤

__call__(
    input_ids: Tensor,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    num_beams: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 151643,
) -> GenerationOutputs

Generates sequences for the given input token IDs.

Parameters:

Name Type Description Default
input_ids Tensor

Input token ID tensor of shape (batch, seq_len).

required
max_gen_seq_length int

Maximum tokens to generate. Defaults to 512.

512
min_gen_seq_length int

Minimum tokens to generate. Defaults to 0.

0
num_return_sequences int

Completions per prompt. Defaults to 1.

1
num_beams int

Beam search width. Defaults to 1. Note: this method does not forward the value to the underlying generate call.

1
do_sample bool

Enable sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p sampling. Defaults to 1.0.

1.0
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
stop str or List[str]

Stop strings.

None
pad_token_id int

Token ID used for padding. Defaults to 151643.

151643

Returns:

Name Type Description
GenerationOutputs GenerationOutputs

Sequences tensor of shape (batch, num_return_sequences, max_gen_seq_length).

Source code in src/unitorch/cli/models/vllm/modeling.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def __call__(
    self,
    input_ids: torch.Tensor,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    num_beams: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 151643,
) -> GenerationOutputs:
    """
    Generates sequences for the given input token IDs.

    Delegates generation to the parent class's ``generate`` method, pads the
    returned token IDs with ``_pad_token_ids`` and wraps the result in
    ``GenerationOutputs``.

    Args:
        input_ids (torch.Tensor): Input token ID tensor of shape ``(batch, seq_len)``.
        max_gen_seq_length (int): Maximum tokens to generate; also passed to
            ``_pad_token_ids`` as the padding length. Defaults to 512.
        min_gen_seq_length (int): Minimum tokens to generate. Defaults to 0.
        num_return_sequences (int): Completions per prompt. Defaults to 1.
        num_beams (int): Beam search width. Defaults to 1. This method does not
            forward the value to the parent's ``generate`` call.
        do_sample (bool): Enable sampling. Defaults to False.
        temperature (float): Sampling temperature. Defaults to 1.0.
        top_k (int): Top-k sampling. Defaults to 50.
        top_p (float): Top-p sampling. Defaults to 1.0.
        repetition_penalty (float): Repetition penalty. Defaults to 1.0.
        stop (str or List[str], optional): Stop strings.
        pad_token_id (int): Token ID used for padding. Defaults to 151643.

    Returns:
        GenerationOutputs: Sequences tensor of shape ``(batch, num_return_sequences, max_gen_seq_length)``.
    """
    # ``num_beams`` is deliberately absent here, matching the original call.
    generation_options = dict(
        input_ids=input_ids,
        max_gen_seq_length=max_gen_seq_length,
        min_gen_seq_length=min_gen_seq_length,
        num_return_sequences=num_return_sequences,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop=stop,
        pad_token_id=pad_token_id,
    )
    generated_ids = super().generate(**generation_options)
    padded = _pad_token_ids(generated_ids, pad_token_id, max_gen_seq_length)
    return GenerationOutputs(sequences=padded)

QWen3VLVLLMForGeneration¤

Tip

core/model/vllm/generation/qwen3_vl is the section for configuration of QWen3VLVLLMForGeneration.

Bases: VLLMVLForGeneration

QWen3-VL vision-language generation model using the vLLM inference engine.

Uses vLLM's multimodal offline batch engine for high-throughput inference over text and image inputs. Accepts tokenized input_ids tensors and pixel-values tensors (or raw PIL.Image) and returns GenerationOutputs compatible with unitorch-infer.

Source code in src/unitorch/cli/models/vllm/modeling_vl.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    hf_name_or_folder: str,
    tensor_parallel_size: Optional[int] = 1,
    pipeline_parallel_size: Optional[int] = 1,
    gpu_memory_utilization: Optional[float] = 0.90,
    max_model_len: Optional[int] = None,
    max_num_seqs: Optional[int] = 128,
    max_num_images: Optional[int] = 8,
    enable_prefix_caching: Optional[bool] = False,
    dtype: Optional[str] = "auto",
    enforce_eager: Optional[bool] = False,
    quantization: Optional[str] = None,
):
    """
    Builds the QWen3-VL vLLM vision-language generation model.

    This initializer adds no logic of its own: every argument is handed to the
    parent initializer by keyword, unchanged. The meaning of each option is
    defined by the parent class (presumably vLLM engine settings — confirm
    against the base class).

    Args:
        hf_name_or_folder (str): Forwarded unchanged as ``hf_name_or_folder``.
        tensor_parallel_size (int): Forwarded unchanged. Defaults to 1.
        pipeline_parallel_size (int): Forwarded unchanged. Defaults to 1.
        gpu_memory_utilization (float): Forwarded unchanged. Defaults to 0.90.
        max_model_len (int, optional): Forwarded unchanged. Defaults to None.
        max_num_seqs (int): Forwarded unchanged. Defaults to 128.
        max_num_images (int): Forwarded unchanged. Defaults to 8.
        enable_prefix_caching (bool): Forwarded unchanged. Defaults to False.
        dtype (str): Forwarded unchanged. Defaults to ``"auto"``.
        enforce_eager (bool): Forwarded unchanged. Defaults to False.
        quantization (str, optional): Forwarded unchanged. Defaults to None.
    """
    # Collect the options once so the delegation below is a single call.
    engine_options = dict(
        hf_name_or_folder=hf_name_or_folder,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        max_num_images=max_num_images,
        enable_prefix_caching=enable_prefix_caching,
        dtype=dtype,
        enforce_eager=enforce_eager,
        quantization=quantization,
    )
    super().__init__(**engine_options)

from_config classmethod ¤

from_config(config, **kwargs)
Source code in src/unitorch/cli/models/vllm/modeling_vl.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
@classmethod
@config_defaults_init("core/model/vllm/generation/qwen3_vl")
def from_config(cls, config, **kwargs):
    """
    Creates an instance from the ``core/model/vllm/generation/qwen3_vl`` config section.

    The checkpoint is resolved with ``pop_value`` from two candidates, in this
    order: the ``hf_name_or_folder`` option, and the ``hf_pretrained_name``
    entry that ``pretrained_vllm_infos`` holds for ``pretrained_name``
    (default ``"qwen3-vl-2b-instruct"``). All remaining constructor options are
    read from the same section with the fallbacks listed in the body.

    Args:
        config: Configuration object providing ``set_default_section`` and
            ``getoption``.
        **kwargs: Accepted but not used inside this method body.

    Returns:
        An instance of ``cls`` built from the resolved options.
    """
    config.set_default_section("core/model/vllm/generation/qwen3_vl")
    preset_name = config.getoption("pretrained_name", "qwen3-vl-2b-instruct")

    # Arguments are evaluated left to right: the explicit option is read first,
    # then the preset table is consulted.
    checkpoint = pop_value(
        config.getoption("hf_name_or_folder", None),
        nested_dict_value(pretrained_vllm_infos, preset_name, "hf_pretrained_name"),
    )

    # (option name, fallback) pairs, read in this exact order.
    option_fallbacks = (
        ("tensor_parallel_size", 1),
        ("pipeline_parallel_size", 1),
        ("gpu_memory_utilization", 0.90),
        ("max_model_len", None),
        ("max_num_seqs", 128),
        ("max_num_images", 8),
        ("enable_prefix_caching", False),
        ("dtype", "auto"),
        ("enforce_eager", False),
        ("quantization", None),
    )
    engine_options = {}
    for option_name, fallback in option_fallbacks:
        engine_options[option_name] = config.getoption(option_name, fallback)

    return cls(hf_name_or_folder=checkpoint, **engine_options)

__call__ ¤

__call__(
    input_ids: Tensor,
    pixel_values: Optional[Tensor] = None,
    image_grid_thw: Optional[Tensor] = None,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 151643,
) -> GenerationOutputs

Generates sequences for the given text and image inputs.

Passes already-preprocessed pixel_values (shape (B, num_patches, channels)) and image_grid_thw directly to vLLM via each request's multi_modal_data entry, bypassing vLLM's own image pre-processing pipeline so that the unitorch processor output is used as-is (matching the HuggingFace reference implementation).

Parameters:

Name Type Description Default
input_ids Tensor

Input token ID tensor of shape (batch, seq_len).

required
pixel_values Tensor

Pre-processed patch tensor of shape (B, num_patches, channels) produced by the unitorch QWenVL processor.

None
image_grid_thw Tensor

Grid metadata tensor of shape (B, 3) containing (temporal, height, width) patch counts per sample.

None
max_gen_seq_length int

Maximum tokens to generate. Defaults to 512.

512
min_gen_seq_length int

Minimum tokens to generate. Defaults to 0.

0
num_return_sequences int

Completions per prompt. Defaults to 1.

1
do_sample bool

Enable sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p sampling. Defaults to 1.0.

1.0
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
stop str or List[str]

Stop strings.

None
pad_token_id int

Token ID used for padding. Defaults to 151643.

151643

Returns:

Name Type Description
GenerationOutputs GenerationOutputs

Sequences tensor of shape (batch, num_return_sequences, max_gen_seq_length).

Source code in src/unitorch/cli/models/vllm/modeling_vl.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def __call__(
    self,
    input_ids: torch.Tensor,
    pixel_values: Optional[torch.Tensor] = None,
    image_grid_thw: Optional[torch.Tensor] = None,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 151643,
) -> GenerationOutputs:
    """
    Generates sequences for the given text and image inputs.

    Each batch row becomes one vLLM request: pad tokens are dropped from the row,
    the remaining IDs are turned into a prompt with ``self._decode_prompt``, and,
    when both ``pixel_values`` and ``image_grid_thw`` are supplied, the row's
    already-preprocessed patches are attached under the request's
    ``multi_modal_data`` entry. This bypasses vLLM's own image pre-processing
    pipeline so that the unitorch processor output is used as-is (matching the
    HuggingFace reference implementation). If either image argument is ``None``
    the requests are text-only.

    Args:
        input_ids (torch.Tensor): Input token ID tensor of shape ``(batch, seq_len)``.
        pixel_values (torch.Tensor, optional): Pre-processed patch tensor of shape
            ``(B, num_patches, channels)`` produced by the unitorch QWenVL processor.
            Each row is cast to ``torch.bfloat16`` before being sent.
        image_grid_thw (torch.Tensor, optional): Grid metadata tensor of shape
            ``(B, 3)`` containing ``(temporal, height, width)`` patch counts per sample.
        max_gen_seq_length (int): Maximum tokens to generate. Defaults to 512.
        min_gen_seq_length (int): Minimum tokens to generate. Defaults to 0.
        num_return_sequences (int): Completions per prompt. Defaults to 1.
        do_sample (bool): Enable sampling. When False, temperature, top-k and
            top-p are replaced by ``0.0``, ``-1`` and ``1.0``. Defaults to False.
        temperature (float): Sampling temperature. Defaults to 1.0.
        top_k (int): Top-k sampling. Defaults to 50.
        top_p (float): Top-p sampling. Defaults to 1.0.
        repetition_penalty (float): Repetition penalty. Defaults to 1.0.
        stop (str or List[str], optional): Stop strings.
        pad_token_id (int): Token ID removed from the prompts and used to pad the
            outputs. Defaults to 151643.

    Returns:
        GenerationOutputs: Sequences tensor of shape ``(batch, num_return_sequences, max_gen_seq_length)``.
    """
    from vllm import SamplingParams

    # With sampling disabled, fall back to fixed decoding values instead of
    # the caller-supplied ones.
    if do_sample:
        used_temperature, used_top_k, used_top_p = temperature, top_k, top_p
    else:
        used_temperature, used_top_k, used_top_p = 0.0, -1, 1.0

    sampling_params = SamplingParams(
        n=num_return_sequences,
        max_tokens=max_gen_seq_length,
        min_tokens=min_gen_seq_length,
        temperature=used_temperature,
        top_k=used_top_k,
        top_p=used_top_p,
        repetition_penalty=repetition_penalty,
        stop=stop,
        # Only <|endoftext|> (151643) stops generation. Qwen3-VL may emit
        # <|im_end|> before the visible answer content, so treating that as a
        # hard stop can collapse the response to an empty string.
        stop_token_ids=[151643],
    )

    # Images are attached only when both tensors are available.
    attach_images = pixel_values is not None and image_grid_thw is not None

    requests = []
    for row in range(input_ids.shape[0]):
        prompt_ids = [tok for tok in input_ids[row].tolist() if tok != pad_token_id]
        request: Dict[str, Any] = {"prompt": self._decode_prompt(prompt_ids)}
        if attach_images:
            sample_grid = image_grid_thw[row]
            if sample_grid.dim() == 1:
                # A single (t, h, w) triple is promoted to a one-row 2-D tensor.
                sample_grid = sample_grid.unsqueeze(0)
            # The pre-processed patch tensor goes straight to the vLLM model
            # through multi_modal_data. Qwen2VLMultiModalDataParserV2
            # (registered via @replace in unitorch.models.vllm.modeling_vl)
            # extends vLLM's data parser to accept
            # {"pixel_values", "image_grid_thw"} dicts and routes them to
            # DictEmbeddingItems, so the vision encoder receives exactly the
            # patches the unitorch Qwen2VLImageProcessor produced.
            request["multi_modal_data"] = {
                "image": {
                    "pixel_values": pixel_values[row].to(torch.bfloat16),
                    "image_grid_thw": sample_grid,
                }
            }
        requests.append(request)

    results = self.llm.generate(requests, sampling_params=sampling_params)
    generated_ids = [
        [candidate.token_ids for candidate in result.outputs] for result in results
    ]
    padded = _pad_token_ids(generated_ids, pad_token_id, max_gen_seq_length)
    return GenerationOutputs(sequences=padded)