Skip to content

unitorch.models.vllm¤

VLLMForGeneration¤

Text generation model backed by vLLM offline inference engine.

Wraps vllm.LLM for synchronous and asynchronous token generation. Accepts tokenized input_ids tensors (compatible with unitorch-infer) and returns the generated token IDs as nested Python lists (List[List[List[int]]] from generate, List[List[int]] from async_generate).

Initializes the vLLM text generation engine.

Parameters:

Name Type Description Default
hf_name_or_folder str

Path to the HuggingFace model folder.

required
tensor_parallel_size int

Number of GPUs for tensor parallelism. Defaults to 1.

1
pipeline_parallel_size int

Number of GPUs for pipeline parallelism. Defaults to 1.

1
gpu_memory_utilization float

Fraction of GPU memory to reserve for vLLM. Defaults to 0.90.

0.9
max_model_len int

Maximum sequence length. None uses model default.

None
max_num_seqs int

Maximum number of concurrent sequences. Defaults to 256.

256
enable_prefix_caching bool

Enable automatic KV-cache prefix sharing. Defaults to True.

True
trust_remote_code bool

Allow remote model code execution. Defaults to True.

True
dtype str

Model weight dtype ("auto", "float16", "bfloat16"). Defaults to "auto".

'auto'
enforce_eager bool

Disable CUDA graph capture (useful for debugging). Defaults to False.

False
quantization str

Quantization method ("awq", "gptq", etc.).

None
Source code in src/unitorch/models/vllm/modeling.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(
    self,
    hf_name_or_folder: str,
    tensor_parallel_size: Optional[int] = 1,
    pipeline_parallel_size: Optional[int] = 1,
    gpu_memory_utilization: Optional[float] = 0.90,
    max_model_len: Optional[int] = None,
    max_num_seqs: Optional[int] = 256,
    enable_prefix_caching: Optional[bool] = True,
    trust_remote_code: Optional[bool] = True,
    dtype: Optional[str] = "auto",
    enforce_eager: Optional[bool] = False,
    quantization: Optional[str] = None,
):
    """
    Build the vLLM engine used for text generation.

    The engine is stored on ``self.llm`` and ``self.shutdown`` is registered
    with ``atexit`` so it is invoked when the interpreter exits.

    Args:
        hf_name_or_folder (str): Path to the HuggingFace model folder.
        tensor_parallel_size (int): Number of GPUs for tensor parallelism. Defaults to 1.
        pipeline_parallel_size (int): Number of GPUs for pipeline parallelism. Defaults to 1.
        gpu_memory_utilization (float): Fraction of GPU memory to reserve for vLLM. Defaults to 0.90.
        max_model_len (int, optional): Maximum sequence length. When None the argument
            is not forwarded to vLLM at all.
        max_num_seqs (int): Maximum number of concurrent sequences. Defaults to 256.
        enable_prefix_caching (bool): Enable automatic KV-cache prefix sharing. Defaults to True.
        trust_remote_code (bool): Allow remote model code execution. Defaults to True.
        dtype (str): Model weight dtype (``"auto"``, ``"float16"``, ``"bfloat16"``). Defaults to ``"auto"``.
        enforce_eager (bool): Disable CUDA graph capture (useful for debugging). Defaults to False.
        quantization (str, optional): Quantization method (``"awq"``, ``"gptq"``, etc.).
            When None the argument is not forwarded to vLLM at all.
    """
    engine_args = {
        "tensor_parallel_size": tensor_parallel_size,
        "pipeline_parallel_size": pipeline_parallel_size,
        "gpu_memory_utilization": gpu_memory_utilization,
        "max_num_seqs": max_num_seqs,
        "enable_prefix_caching": enable_prefix_caching,
        "trust_remote_code": trust_remote_code,
        "dtype": dtype,
        "enforce_eager": enforce_eager,
    }
    # Optional settings are only forwarded when the caller supplied a value.
    for option_name, option_value in (
        ("max_model_len", max_model_len),
        ("quantization", quantization),
    ):
        if option_value is not None:
            engine_args[option_name] = option_value

    self.llm = LLM(model=hf_name_or_folder, **engine_args)
    # Make sure the engine is torn down when the interpreter exits.
    atexit.register(self.shutdown)

llm instance-attribute ¤

llm = LLM(model=hf_name_or_folder, **kwargs)

cuda ¤

cuda(device=None)
Source code in src/unitorch/models/vllm/modeling.py
67
68
69
70
def cuda(self, device=None):
    """
    No-op kept for interface compatibility with nn.Module-based models.

    vLLM manages GPU placement internally at engine init time, so ``device``
    is ignored and the instance itself is returned.
    """
    return self

eval ¤

eval()
Source code in src/unitorch/models/vllm/modeling.py
72
73
74
def eval(self):
    """
    No-op kept for interface compatibility.

    vLLM is always in inference mode, so nothing is switched; the instance
    itself is returned.
    """
    return self

train ¤

train(mode=True)
Source code in src/unitorch/models/vllm/modeling.py
76
77
78
def train(self, mode=True):
    """
    No-op kept for interface compatibility.

    vLLM does not support training mode, so ``mode`` is ignored and the
    instance itself is returned.
    """
    return self

from_checkpoint ¤

from_checkpoint(ckpt_dir, **kwargs)
Source code in src/unitorch/models/vllm/modeling.py
80
81
82
83
def from_checkpoint(self, ckpt_dir, **kwargs):
    """
    Accept a checkpoint request for interface compatibility and do nothing.

    vLLM loads weights at engine init time from ``hf_name_or_folder``.
    Post-init checkpoint loading is not supported, so ``ckpt_dir`` and any
    extra keyword arguments are silently ignored.
    """
    return None

shutdown ¤

shutdown()

Shutdown the vLLM engine and release GPU memory held by worker processes.

Source code in src/unitorch/models/vllm/modeling.py
85
86
87
88
89
90
def shutdown(self):
    """
    Best-effort shutdown of the vLLM engine.

    Calls ``shutdown()`` on ``self.llm.llm_engine.engine_core``, which is
    intended to release GPU memory held by worker processes. Any ``Exception``
    raised while locating or stopping the engine core is suppressed, so this
    method never raises for that reason.
    """
    try:
        engine_core = self.llm.llm_engine.engine_core
        engine_core.shutdown()
    except Exception:
        # Teardown is deliberately best-effort: a missing or already stopped
        # engine must not raise from this cleanup path.
        return

generate ¤

generate(
    input_ids: Tensor,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 0,
) -> List[List[List[int]]]

Generates token sequences for the given input_ids.

Parameters:

Name Type Description Default
input_ids Tensor

Input token ID tensor of shape (batch, seq_len).

required
max_gen_seq_length int

Maximum number of new tokens to generate. Defaults to 512.

512
min_gen_seq_length int

Minimum number of new tokens to generate. Defaults to 0.

0
num_return_sequences int

Number of completions per prompt. Defaults to 1.

1
num_beams int

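Beam search width (used when use_beam_search=True). Defaults to 1. Note: the current generate() signature does not accept this argument; this row comes from a stale docstring entry.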

required
do_sample bool

Enable sampling; when False uses greedy/beam decoding. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling parameter. Defaults to 50.

50
top_p float

Top-p (nucleus) sampling parameter. Defaults to 1.0.

1.0
repetition_penalty float

Penalty for token repetition. Defaults to 1.0.

1.0
length_penalty float

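Exponential length penalty for beam search. Defaults to 1.0. Note: the current generate() signature does not accept this argument; this row comes from a stale docstring entry.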

required
stop str or List[str]

Stop strings that terminate generation.

None
use_beam_search bool

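Use beam search instead of sampling. Defaults to False. Note: the current generate() signature does not accept this argument; this row comes from a stale docstring entry.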

required

Returns:

Type Description
List[List[List[int]]]

Generated token ID sequences, shape [batch][num_return_sequences][seq_len].

Source code in src/unitorch/models/vllm/modeling.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def generate(
    self,
    input_ids: torch.Tensor,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 0,
) -> List[List[List[int]]]:
    """
    Generates token sequences for the given input_ids.

    Args:
        input_ids (torch.Tensor): Input token ID tensor of shape ``(batch, seq_len)``.
        max_gen_seq_length (int): Maximum number of new tokens to generate. Defaults to 512.
        min_gen_seq_length (int): Minimum number of new tokens to generate. Defaults to 0.
        num_return_sequences (int): Number of completions per prompt. Defaults to 1.
        do_sample (bool): Enable sampling. When False, ``temperature`` is forced to 0.0
            and ``top_k`` / ``top_p`` are replaced by -1 / 1.0, so the values passed
            for those three arguments are ignored. Defaults to False.
        temperature (float): Sampling temperature (only used when ``do_sample`` is True).
            Defaults to 1.0.
        top_k (int): Top-k sampling parameter (only used when ``do_sample`` is True).
            Defaults to 50.
        top_p (float): Top-p (nucleus) sampling parameter (only used when ``do_sample``
            is True). Defaults to 1.0.
        repetition_penalty (float): Penalty for token repetition. Defaults to 1.0.
        stop (str or List[str], optional): Stop strings that terminate generation.
        pad_token_id (int): Token ID treated as padding. Every occurrence of this ID,
            wherever it appears in a row, is removed before the prompt is handed
            to vLLM. Defaults to 0.

    Returns:
        List[List[List[int]]]: Generated token ID sequences,
        shape ``[batch][num_return_sequences][seq_len]``.
    """
    # Always stop at <|im_end|> (151645) and <|endoftext|> (151643) so that
    # vLLM does not generate past the model's answer turn into reasoning/thinking text.
    stop_token_ids = [151643, 151645]

    sampling_params = SamplingParams(
        n=num_return_sequences,
        max_tokens=max_gen_seq_length,
        min_tokens=min_gen_seq_length,
        temperature=temperature if do_sample else 0.0,
        top_k=top_k if do_sample else -1,
        top_p=top_p if do_sample else 1.0,
        repetition_penalty=repetition_penalty,
        stop=stop,
        stop_token_ids=stop_token_ids,
    )

    # Convert tensor rows to prompt_token_ids format (strips padding tokens)
    prompts = [
        {"prompt_token_ids": [t for t in row.tolist() if t != pad_token_id]}
        for row in input_ids
    ]

    outputs = self.llm.generate(prompts, sampling_params=sampling_params)
    # Copy each completion into a plain list so the result always matches the
    # annotated return type, whatever sequence type vLLM uses for token_ids.
    return [[list(o.token_ids) for o in req.outputs] for req in outputs]

async_generate async ¤

async_generate(
    input_ids: Tensor,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
) -> List[List[int]]

Asynchronously generates token sequences for a single-row input_ids tensor.

Parameters:

Name Type Description Default
input_ids Tensor

Input token ID tensor of shape (1, seq_len) or (seq_len,).

required
max_gen_seq_length int

Maximum tokens to generate. Defaults to 512.

512
min_gen_seq_length int

Minimum tokens to generate. Defaults to 0.

0
num_return_sequences int

Number of completions. Defaults to 1.

1
do_sample bool

Enable sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p sampling. Defaults to 1.0.

1.0
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
stop str or List[str]

Stop strings.

None

Returns:

Type Description
List[List[int]]

List[List[int]]: Generated token ID sequences for the single prompt.

Source code in src/unitorch/models/vllm/modeling.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
async def async_generate(
    self,
    input_ids: torch.Tensor,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 0,
) -> List[List[int]]:
    """
    Asynchronously generates token sequences for a single-row input_ids tensor.

    This coroutine delegates to ``self.generate`` with a direct (non-awaited)
    call, so generation runs to completion before control returns to the
    event loop.

    Args:
        input_ids (torch.Tensor): Input token ID tensor of shape ``(1, seq_len)`` or ``(seq_len,)``.
            A 1-D tensor is expanded to a batch of one.
        max_gen_seq_length (int): Maximum tokens to generate. Defaults to 512.
        min_gen_seq_length (int): Minimum tokens to generate. Defaults to 0.
        num_return_sequences (int): Number of completions. Defaults to 1.
        do_sample (bool): Enable sampling. Defaults to False.
        temperature (float): Sampling temperature. Defaults to 1.0.
        top_k (int): Top-k sampling. Defaults to 50.
        top_p (float): Top-p sampling. Defaults to 1.0.
        repetition_penalty (float): Repetition penalty. Defaults to 1.0.
        stop (str or List[str], optional): Stop strings.
        pad_token_id (int): Padding token ID forwarded to ``generate``. Defaults to 0,
            the same default ``generate`` uses.

    Returns:
        List[List[int]]: Generated token ID sequences for the single prompt
        (the first entry of the batch result returned by ``generate``).
    """
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)
    results = self.generate(
        input_ids=input_ids,
        max_gen_seq_length=max_gen_seq_length,
        min_gen_seq_length=min_gen_seq_length,
        num_return_sequences=num_return_sequences,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop=stop,
        pad_token_id=pad_token_id,
    )
    # Only the first (and expected only) prompt's completions are returned.
    return results[0]

VLLMVLForGeneration¤

Vision-language generation model backed by vLLM offline inference engine.

Wraps vllm.LLM for multimodal (text + image) generation supporting both single and multi-image inputs via the vLLM multimodal data API. Accepts tokenized input_ids tensors and pixel-values tensors (compatible with unitorch-infer) in addition to raw PIL.Image inputs.

Initializes the vLLM vision-language generation engine.

Parameters:

Name Type Description Default
hf_name_or_folder str

Path to the HuggingFace model folder.

required
tensor_parallel_size int

Number of GPUs for tensor parallelism. Defaults to 1.

1
pipeline_parallel_size int

Number of GPUs for pipeline parallelism. Defaults to 1.

1
gpu_memory_utilization float

Fraction of GPU memory to reserve. Defaults to 0.90.

0.9
max_model_len int

Maximum total sequence length. None uses model default.

None
max_num_seqs int

Maximum concurrent sequences. Defaults to 128.

128
max_num_images int

Maximum images per request (vLLM limit_mm_per_prompt). Defaults to 8.

8
enable_prefix_caching bool

Enable KV-cache prefix sharing. Defaults to False.

False
trust_remote_code bool

Allow remote model code. Defaults to True.

True
dtype str

Weight dtype. Defaults to "auto".

'auto'
enforce_eager bool

Disable CUDA graph capture. Defaults to False.

False
quantization str

Quantization method.

None
Source code in src/unitorch/models/vllm/modeling_vl.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def __init__(
    self,
    hf_name_or_folder: str,
    tensor_parallel_size: Optional[int] = 1,
    pipeline_parallel_size: Optional[int] = 1,
    gpu_memory_utilization: Optional[float] = 0.90,
    max_model_len: Optional[int] = None,
    max_num_seqs: Optional[int] = 128,
    max_num_images: Optional[int] = 8,
    enable_prefix_caching: Optional[bool] = False,
    trust_remote_code: Optional[bool] = True,
    dtype: Optional[str] = "auto",
    enforce_eager: Optional[bool] = False,
    quantization: Optional[str] = None,
):
    """
    Build the vLLM engine used for vision-language generation.

    The engine is stored on ``self.llm``, its tokenizer on ``self.tokenizer``,
    and ``self.shutdown`` is registered with ``atexit`` so it is invoked when
    the interpreter exits. ``enable_mm_embeds=True`` is always passed to vLLM.

    Args:
        hf_name_or_folder (str): Path to the HuggingFace model folder.
        tensor_parallel_size (int): Number of GPUs for tensor parallelism. Defaults to 1.
        pipeline_parallel_size (int): Number of GPUs for pipeline parallelism. Defaults to 1.
        gpu_memory_utilization (float): Fraction of GPU memory to reserve. Defaults to 0.90.
        max_model_len (int, optional): Maximum total sequence length. When None the
            argument is not forwarded to vLLM at all.
        max_num_seqs (int): Maximum concurrent sequences. Defaults to 128.
        max_num_images (int): Maximum images per request, forwarded as
            ``limit_mm_per_prompt={"image": max_num_images}``. Defaults to 8.
        enable_prefix_caching (bool): Enable KV-cache prefix sharing. Defaults to False.
        trust_remote_code (bool): Allow remote model code. Defaults to True.
        dtype (str): Weight dtype. Defaults to ``"auto"``.
        enforce_eager (bool): Disable CUDA graph capture. Defaults to False.
        quantization (str, optional): Quantization method. When None the argument
            is not forwarded to vLLM at all.
    """
    engine_args = {
        "tensor_parallel_size": tensor_parallel_size,
        "pipeline_parallel_size": pipeline_parallel_size,
        "gpu_memory_utilization": gpu_memory_utilization,
        "max_num_seqs": max_num_seqs,
        "limit_mm_per_prompt": {"image": max_num_images},
        "enable_prefix_caching": enable_prefix_caching,
        "trust_remote_code": trust_remote_code,
        "dtype": dtype,
        "enforce_eager": enforce_eager,
        "enable_mm_embeds": True,
    }
    # Optional settings are only forwarded when the caller supplied a value.
    for option_name, option_value in (
        ("max_model_len", max_model_len),
        ("quantization", quantization),
    ):
        if option_value is not None:
            engine_args[option_name] = option_value

    self.llm = LLM(model=hf_name_or_folder, **engine_args)
    # Kept so prompt token IDs can be decoded back to text for multimodal requests.
    self.tokenizer = self.llm.get_tokenizer()
    # Make sure the engine is torn down when the interpreter exits.
    atexit.register(self.shutdown)

llm instance-attribute ¤

llm = LLM(model=hf_name_or_folder, **kwargs)

tokenizer instance-attribute ¤

tokenizer = get_tokenizer()

cuda ¤

cuda(device=None)
Source code in src/unitorch/models/vllm/modeling_vl.py
75
76
77
78
def cuda(self, device=None):
    """
    No-op kept for interface compatibility with nn.Module-based models.

    vLLM manages GPU placement internally at engine init time, so ``device``
    is ignored and the instance itself is returned.
    """
    return self

eval ¤

eval()
Source code in src/unitorch/models/vllm/modeling_vl.py
80
81
82
def eval(self):
    """
    No-op kept for interface compatibility.

    vLLM is always in inference mode, so nothing is switched; the instance
    itself is returned.
    """
    return self

train ¤

train(mode=True)
Source code in src/unitorch/models/vllm/modeling_vl.py
84
85
86
def train(self, mode=True):
    """
    No-op kept for interface compatibility.

    vLLM does not support training mode, so ``mode`` is ignored and the
    instance itself is returned.
    """
    return self

from_checkpoint ¤

from_checkpoint(ckpt_dir, **kwargs)
Source code in src/unitorch/models/vllm/modeling_vl.py
88
89
90
91
def from_checkpoint(self, ckpt_dir, **kwargs):
    """
    Accept a checkpoint request for interface compatibility and do nothing.

    vLLM loads weights at engine init time from ``hf_name_or_folder``.
    Post-init checkpoint loading is not supported, so ``ckpt_dir`` and any
    extra keyword arguments are silently ignored.
    """
    return None

shutdown ¤

shutdown()

Shutdown the vLLM engine and release GPU memory held by worker processes.

Source code in src/unitorch/models/vllm/modeling_vl.py
93
94
95
96
97
98
def shutdown(self):
    """
    Best-effort shutdown of the vLLM engine.

    Calls ``shutdown()`` on ``self.llm.llm_engine.engine_core``, which is
    intended to release GPU memory held by worker processes. Any ``Exception``
    raised while locating or stopping the engine core is suppressed, so this
    method never raises for that reason.
    """
    try:
        engine_core = self.llm.llm_engine.engine_core
        engine_core.shutdown()
    except Exception:
        # Teardown is deliberately best-effort: a missing or already stopped
        # engine must not raise from this cleanup path.
        return

_decode_prompt ¤

_decode_prompt(token_ids: List[int]) -> str

Decode prompt token IDs back to the multimodal prompt string expected by vLLM.

The unitorch processor expands a single <|image_pad|> / <|video_pad|> placeholder into a long run of repeated special tokens based on the visual grid size. vLLM expects the unexpanded chat-template string and performs the multimodal expansion internally, so we collapse those runs before decoding the prompt text.

Source code in src/unitorch/models/vllm/modeling_vl.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def _decode_prompt(self, token_ids: List[int]) -> str:
    """
    Turn prompt token IDs back into the multimodal prompt text handed to vLLM.

    The unitorch processor expands a single ``<|image_pad|>`` / ``<|video_pad|>``
    placeholder into a long run of repeated special tokens based on the visual
    grid size. vLLM expects the *unexpanded* chat-template string and performs
    the multimodal expansion internally, so every run of consecutive identical
    placeholder tokens is reduced to a single token before decoding.

    Args:
        token_ids (List[int]): Prompt token IDs (padding already removed by the caller).

    Returns:
        str: The decoded prompt, with special tokens kept.
    """
    tokenizer = self.tokenizer

    # Resolve the placeholder IDs: prefer the tokenizer attribute, otherwise
    # look the placeholder token up by name when the tokenizer supports it.
    placeholder_ids = set()
    for attr_name, token_text in (
        ("image_token_id", "<|image_pad|>"),
        ("video_token_id", "<|video_pad|>"),
    ):
        resolved = getattr(tokenizer, attr_name, None)
        if resolved is None and hasattr(tokenizer, "convert_tokens_to_ids"):
            resolved = tokenizer.convert_tokens_to_ids(token_text)
        if resolved is not None:
            placeholder_ids.add(resolved)

    if placeholder_ids:
        deduped = []
        for current in token_ids:
            is_repeat = (
                current in placeholder_ids
                and bool(deduped)
                and deduped[-1] == current
            )
            if not is_repeat:
                deduped.append(current)
        token_ids = deduped

    try:
        decoded = tokenizer.batch_decode(
            [token_ids],
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]
    except TypeError:
        # Fall back to the single-sequence decode call when the batch call
        # (or indexing its result) raises TypeError.
        return tokenizer.decode(token_ids, skip_special_tokens=False)

_normalize_images ¤

_normalize_images(
    images: Optional[Union[Tensor, Image, List]],
    batch_size: int,
) -> Optional[List[Optional[List[Image]]]]

Normalize images input to List[Optional[List[PIL.Image]]] of length batch_size.

Accepts:

  • None: no images for any prompt.
  • torch.Tensor: shape (B, C, H, W) or (C, H, W) pixel-values tensor.
  • PIL.Image: single image shared across all prompts.
  • List[PIL.Image]: one image per prompt.
  • List[torch.Tensor]: one pixel-values tensor per prompt.
  • List[List[PIL.Image or torch.Tensor]]: multiple images per prompt.

Source code in src/unitorch/models/vllm/modeling_vl.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def _normalize_images(
    self,
    images: Optional[Union[torch.Tensor, Image.Image, List]],
    batch_size: int,
) -> Optional[List[Optional[List[Image.Image]]]]:
    """
    Normalize ``images`` into one list of images per prompt.

    Accepts:
    - ``None``: no images for any prompt.
    - ``torch.Tensor``: shape ``(B, C, H, W)`` or ``(C, H, W)`` pixel-values tensor.
    - ``PIL.Image``: single image shared across all prompts.
    - ``List[PIL.Image]``: one image per prompt.
    - ``List[torch.Tensor]``: one pixel-values tensor per prompt.
    - ``List[List[PIL.Image or torch.Tensor]]``: multiple images per prompt.

    Tensors are passed through unchanged (they are not converted to
    ``PIL.Image``), so the inner lists may hold tensors as well as images.

    Args:
        images: Image input in one of the formats listed above.
        batch_size (int): Number of prompts. Only used to repeat a single
            shared image or a single ``(C, H, W)`` tensor; a 4-D tensor or a
            list is returned with its own length and is not checked against
            ``batch_size``.

    Returns:
        ``None`` when ``images`` is ``None``; otherwise a list with one inner
        list of images per prompt.

    Raises:
        ValueError: If a tensor is neither 3-D nor 4-D, if a list item is not
            an image, a tensor or a list, or if ``images`` has an unsupported type.
    """
    if images is None:
        return None

    # torch.Tensor pixel_values batch (B, C, H, W) or single (C, H, W)
    if isinstance(images, torch.Tensor):
        if images.dim() == 4:
            return [[frame] for frame in images]
        if images.dim() == 3:
            # Build a separate inner list per prompt; ``[[x]] * n`` would make
            # every prompt share (and co-mutate) the same list object.
            return [[images] for _ in range(batch_size)]
        raise ValueError(f"Unexpected pixel_values shape: {images.shape}")

    if isinstance(images, Image.Image):
        # Same image for every prompt, but each prompt gets its own list.
        return [[images] for _ in range(batch_size)]

    # List input
    if isinstance(images, list):
        result = []
        for item in images:
            if isinstance(item, (Image.Image, torch.Tensor)):
                result.append([item])
            elif isinstance(item, list):
                # Nested lists are used as given (multiple images for one prompt).
                result.append(item)
            else:
                raise ValueError(f"Unexpected image type: {type(item)}")
        return result

    raise ValueError(f"Unsupported images type: {type(images)}")

generate ¤

generate(
    input_ids: Tensor,
    images: Optional[Union[Tensor, Image, List]] = None,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 0,
) -> List[List[List[int]]]

Generates token sequences for the given text and image inputs.

Parameters:

Name Type Description Default
input_ids Tensor

Input token ID tensor of shape (batch, seq_len).

required
images Optional[Union[Tensor, Image, List]]

Input image(s). Accepts:

  • None — text-only generation.
  • torch.Tensor — pixel-values tensor (B, C, H, W) or (C, H, W).
  • PIL.Image — single image shared across all prompts.
  • List[PIL.Image] — one image per prompt.
  • List[torch.Tensor] — one pixel-values tensor per prompt.
  • List[List[PIL.Image or torch.Tensor]] — multiple images per prompt.
None
max_gen_seq_length int

Maximum new tokens. Defaults to 512.

512
min_gen_seq_length int

Minimum new tokens. Defaults to 0.

0
num_return_sequences int

Completions per prompt. Defaults to 1.

1
do_sample bool

Enable sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p sampling. Defaults to 1.0.

1.0
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
stop str or List[str]

Stop strings.

None

Returns:

Type Description
List[List[List[int]]]

Generated token ID sequences, shape [batch][num_return_sequences][seq_len].

Source code in src/unitorch/models/vllm/modeling_vl.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def generate(
    self,
    input_ids: torch.Tensor,
    images: Optional[Union[torch.Tensor, Image.Image, List]] = None,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 0,
) -> List[List[List[int]]]:
    """
    Generates token sequences for the given text and image inputs.

    Prompts that have images are decoded back to text with ``_decode_prompt``
    and sent to vLLM as ``{"prompt": ..., "multi_modal_data": {"image": ...}}``;
    prompts without images are sent as ``{"prompt_token_ids": ...}``.

    Args:
        input_ids (torch.Tensor): Input token ID tensor of shape ``(batch, seq_len)``.
        images: Input image(s). Accepts:

            - ``None`` — text-only generation.
            - ``torch.Tensor`` — pixel-values tensor ``(B, C, H, W)`` or ``(C, H, W)``.
            - ``PIL.Image`` — single image shared across all prompts.
            - ``List[PIL.Image]`` — one image per prompt.
            - ``List[torch.Tensor]`` — one pixel-values tensor per prompt.
            - ``List[List[PIL.Image or torch.Tensor]]`` — multiple images per prompt.
        max_gen_seq_length (int): Maximum new tokens. Defaults to 512.
        min_gen_seq_length (int): Minimum new tokens. Defaults to 0.
        num_return_sequences (int): Completions per prompt. Defaults to 1.
        do_sample (bool): Enable sampling. When False, ``temperature`` is forced to 0.0
            and ``top_k`` / ``top_p`` are replaced by -1 / 1.0. Defaults to False.
        temperature (float): Sampling temperature. Defaults to 1.0.
        top_k (int): Top-k sampling. Defaults to 50.
        top_p (float): Top-p sampling. Defaults to 1.0.
        repetition_penalty (float): Repetition penalty. Defaults to 1.0.
        stop (str or List[str], optional): Stop strings.
        pad_token_id (int): Token ID treated as padding. Every occurrence of this ID,
            wherever it appears in a row, is removed before the prompt is built.
            Defaults to 0.

    Returns:
        List[List[List[int]]]: Generated token ID sequences,
        shape ``[batch][num_return_sequences][seq_len]``.
    """
    # Qwen3-VL may legitimately emit <|im_end|> as part of the assistant turn
    # boundary, so stopping on it can truncate the entire answer. Keep
    # <|endoftext|> as the only hard stop token.
    stop_token_ids = [151643]

    sampling_params = SamplingParams(
        n=num_return_sequences,
        max_tokens=max_gen_seq_length,
        min_tokens=min_gen_seq_length,
        temperature=temperature if do_sample else 0.0,
        top_k=top_k if do_sample else -1,
        top_p=top_p if do_sample else 1.0,
        repetition_penalty=repetition_penalty,
        stop=stop,
        stop_token_ids=stop_token_ids,
    )

    batch_size = input_ids.shape[0]
    normalized_images = self._normalize_images(images, batch_size)

    inputs = []
    for i, row in enumerate(input_ids):
        token_ids = [t for t in row.tolist() if t != pad_token_id]
        entry: Dict[str, Any]
        if normalized_images is not None and normalized_images[i]:
            imgs = normalized_images[i]
            # Multimodal request: vLLM receives the prompt as text plus the images.
            entry = {"prompt": self._decode_prompt(token_ids)}
            # A single image is passed bare, several images as a list.
            entry["multi_modal_data"] = {
                "image": imgs[0] if len(imgs) == 1 else imgs
            }
        else:
            # Text-only request: pass the token IDs straight through.
            entry = {"prompt_token_ids": token_ids}
        inputs.append(entry)

    outputs = self.llm.generate(inputs, sampling_params=sampling_params)
    # Copy each completion into a plain list so the result always matches the
    # annotated return type, whatever sequence type vLLM uses for token_ids.
    return [[list(o.token_ids) for o in req.outputs] for req in outputs]

async_generate async ¤

async_generate(
    input_ids: Tensor,
    images: Optional[Union[Tensor, Image, List]] = None,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
) -> List[List[int]]

Asynchronously generates token sequences for a single-row input.

Parameters:

Name Type Description Default
input_ids Tensor

Token ID tensor of shape (1, seq_len) or (seq_len,).

required
images Optional[Union[Tensor, Image, List]]

Optional image(s) for the single prompt (same formats as generate).

None
max_gen_seq_length int

Maximum tokens to generate. Defaults to 512.

512
min_gen_seq_length int

Minimum tokens to generate. Defaults to 0.

0
num_return_sequences int

Number of completions. Defaults to 1.

1
do_sample bool

Enable sampling. Defaults to False.

False
temperature float

Sampling temperature. Defaults to 1.0.

1.0
top_k int

Top-k sampling. Defaults to 50.

50
top_p float

Top-p sampling. Defaults to 1.0.

1.0
repetition_penalty float

Repetition penalty. Defaults to 1.0.

1.0
stop str or List[str]

Stop strings.

None

Returns:

Type Description
List[List[int]]

List[List[int]]: Generated token ID sequences for the single prompt.

Source code in src/unitorch/models/vllm/modeling_vl.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
async def async_generate(
    self,
    input_ids: torch.Tensor,
    images: Optional[Union[torch.Tensor, Image.Image, List]] = None,
    max_gen_seq_length: Optional[int] = 512,
    min_gen_seq_length: Optional[int] = 0,
    num_return_sequences: Optional[int] = 1,
    do_sample: Optional[bool] = False,
    temperature: Optional[float] = 1.0,
    top_k: Optional[int] = 50,
    top_p: Optional[float] = 1.0,
    repetition_penalty: Optional[float] = 1.0,
    stop: Optional[Union[str, List[str]]] = None,
    pad_token_id: Optional[int] = 0,
) -> List[List[int]]:
    """
    Asynchronously generates token sequences for a single-row input.

    This coroutine delegates to ``self.generate`` with a direct (non-awaited)
    call, so generation runs to completion before control returns to the
    event loop.

    Args:
        input_ids (torch.Tensor): Token ID tensor of shape ``(1, seq_len)`` or ``(seq_len,)``.
            A 1-D tensor is expanded to a batch of one.
        images: Optional image(s) for the single prompt (same formats as ``generate``).
        max_gen_seq_length (int): Maximum tokens to generate. Defaults to 512.
        min_gen_seq_length (int): Minimum tokens to generate. Defaults to 0.
        num_return_sequences (int): Number of completions. Defaults to 1.
        do_sample (bool): Enable sampling. Defaults to False.
        temperature (float): Sampling temperature. Defaults to 1.0.
        top_k (int): Top-k sampling. Defaults to 50.
        top_p (float): Top-p sampling. Defaults to 1.0.
        repetition_penalty (float): Repetition penalty. Defaults to 1.0.
        stop (str or List[str], optional): Stop strings.
        pad_token_id (int): Padding token ID forwarded to ``generate``. Defaults to 0,
            the same default ``generate`` uses.

    Returns:
        List[List[int]]: Generated token ID sequences for the single prompt
        (the first entry of the batch result returned by ``generate``).
    """
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)
    results = self.generate(
        input_ids=input_ids,
        images=images,
        max_gen_seq_length=max_gen_seq_length,
        min_gen_seq_length=min_gen_seq_length,
        num_return_sequences=num_return_sequences,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop=stop,
        pad_token_id=pad_token_id,
    )
    # Only the first (and expected only) prompt's completions are returned.
    return results[0]