unitorch.cli.models.diffusers¤

WanProcessor¤

Tip

Configuration for WanProcessor is read from the `core/process/diffusion/wan` section.

Bases: WanProcessor

Source code in src/unitorch/cli/models/diffusers/processing_wan.py

def __init__(
    self,
    vocab_path: str,
    vae_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 512,
    position_start_id: Optional[int] = 0,
    video_size: Optional[Tuple[int, int]] = None,
):
    """Forward every constructor argument, by keyword, to the parent processor."""
    parent_kwargs = {
        "vocab_path": vocab_path,
        "vae_config_path": vae_config_path,
        "max_seq_length": max_seq_length,
        "position_start_id": position_start_id,
        "video_size": video_size,
    }
    super().__init__(**parent_kwargs)

from_config `classmethod` ¤

from_config(config, **kwargs)

Source code in src/unitorch/cli/models/diffusers/processing_wan.py

@classmethod
@config_defaults_init("core/process/diffusion/wan")
def from_config(cls, config, **kwargs):
    """Resolve the file paths used to construct the processor.

    Options are read from the ``core/process/diffusion/wan`` section. Each
    path option is combined through ``pop_value`` with the entry registered
    for ``pretrained_name`` in ``pretrained_stable_infos`` and then passed to
    ``cached_path``.

    Returns:
        A dict with the keys ``vocab_path`` and ``vae_config_path``.
    """
    config.set_default_section("core/process/diffusion/wan")
    name = config.getoption("pretrained_name", "wan-v2.2-i2v-14b")
    infos = nested_dict_value(pretrained_stable_infos, name)

    def resolve(option, *info_keys):
        # Read the configured option, look up the registry entry, let
        # pop_value choose between them, then hand the result to cached_path.
        configured = config.getoption(option, None)
        registered = nested_dict_value(infos, *info_keys)
        return cached_path(pop_value(configured, registered))

    return {
        "vocab_path": resolve("vocab_path", "text", "vocab"),
        "vae_config_path": resolve("vae_config_path", "vae", "config"),
    }

_text2video ¤

_text2video(
    prompt: str,
    video: Union[VideoCapture, str, List[Image]],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_wan.py

@register_process("core/process/diffusion/wan/text2video")
def _text2video(
    self,
    prompt: str,
    video: Union[cv2.VideoCapture, str, List[Image.Image]],
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``text2video`` and repackage its fields as ``TensorInputs``."""
    encoded = super().text2video(
        prompt=prompt, video=video, max_seq_length=max_seq_length
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = ("pixel_values", "input_ids", "attention_mask")
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

_text2video_inputs ¤

_text2video_inputs(
    prompt: str,
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_wan.py

@register_process("core/process/diffusion/wan/text2video/inputs")
def _text2video_inputs(
    self,
    prompt: str,
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``text2video_inputs`` and repackage its fields as ``TensorInputs``."""
    encoded = super().text2video_inputs(
        prompt=prompt,
        negative_prompt=negative_prompt,
        max_seq_length=max_seq_length,
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = (
        "input_ids",
        "negative_input_ids",
        "attention_mask",
        "negative_attention_mask",
    )
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

_image2video ¤

_image2video(
    prompt: str,
    video: Union[VideoCapture, str, List[Image]],
    image: Optional[Union[Image, str]] = None,
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_wan.py

@register_process("core/process/diffusion/wan/image2video")
def _image2video(
    self,
    prompt: str,
    video: Union[cv2.VideoCapture, str, List[Image.Image]],
    image: Optional[Union[Image.Image, str]] = None,
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``image2video`` and repackage its fields as ``TensorInputs``."""
    encoded = super().image2video(
        prompt=prompt,
        video=video,
        image=image,
        max_seq_length=max_seq_length,
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = (
        "pixel_values",
        "input_ids",
        "attention_mask",
        "vae_pixel_values",
    )
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

_image2video_inputs ¤

_image2video_inputs(
    prompt: str,
    image: Union[Image, str],
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_wan.py

@register_process("core/process/diffusion/wan/image2video/inputs")
def _image2video_inputs(
    self,
    prompt: str,
    image: Union[Image.Image, str],
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``image2video_inputs`` and repackage its fields as ``TensorInputs``."""
    encoded = super().image2video_inputs(
        prompt=prompt,
        image=image,
        negative_prompt=negative_prompt,
        max_seq_length=max_seq_length,
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = (
        "vae_pixel_values",
        "input_ids",
        "attention_mask",
        "negative_input_ids",
        "negative_attention_mask",
    )
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

WanForText2VideoGeneration¤

Tip

Configuration for WanForText2VideoGeneration is read from the `core/model/diffusers/text2video/wan` section.

Bases: WanForText2VideoGeneration

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    config2_path: Optional[str] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    boundary_ratio: Optional[float] = 0.9,
    seed: Optional[int] = 1123,
    gradient_checkpointing: Optional[bool] = True,
):
    """Forward every constructor argument, by keyword, to the parent model."""
    parent_kwargs = {
        "config_path": config_path,
        "text_config_path": text_config_path,
        "vae_config_path": vae_config_path,
        "scheduler_config_path": scheduler_config_path,
        "config2_path": config2_path,
        "num_train_timesteps": num_train_timesteps,
        "num_infer_timesteps": num_infer_timesteps,
        "freeze_vae_encoder": freeze_vae_encoder,
        "freeze_text_encoder": freeze_text_encoder,
        "snr_gamma": snr_gamma,
        "boundary_ratio": boundary_ratio,
        "seed": seed,
        "gradient_checkpointing": gradient_checkpointing,
    }
    super().__init__(**parent_kwargs)

from_config `classmethod` ¤

from_config(config, **kwargs)

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

@classmethod
@config_defaults_init("core/model/diffusers/text2video/wan")
def from_config(cls, config, **kwargs):
    """Build the model from the ``core/model/diffusers/text2video/wan`` section.

    Steps, in order:
      1. Resolve the config file paths (transformer, optional second
         transformer, text encoder, VAE, scheduler) from the config options
         and the ``pretrained_name`` entry of ``pretrained_stable_infos``.
      2. Read the scalar hyper-parameters and instantiate ``cls``.
      3. Load weights from ``pretrained_weight_path`` when it is set, otherwise
         from the registry entry when one exists. If neither is available,
         no weights are loaded.
      4. Optionally apply LoRA weights named by ``pretrained_lora_names`` or
         given directly by ``pretrained_lora_weights_path``.

    Args:
        config: Configuration object providing ``set_default_section`` and
            ``getoption``.
        **kwargs: Accepted but not used in this method body.

    Returns:
        The constructed instance.
    """
    config.set_default_section("core/model/diffusers/text2video/wan")
    pretrained_name = config.getoption("pretrained_name", "wan-v2.2-t2v-14b")
    pretrained_infos = nested_dict_value(pretrained_stable_infos, pretrained_name)

    config_path = config.getoption("config_path", None)
    config_path = pop_value(
        config_path,
        nested_dict_value(pretrained_infos, "transformer", "config"),
    )
    config_path = cached_path(config_path)

    config2_path = config.getoption("config2_path", None)
    config2_path = pop_value(
        config2_path,
        nested_dict_value(pretrained_infos, "transformer2", "config"),
    )

    # The second transformer config is optional; only resolve it when present.
    if config2_path is not None:
        config2_path = cached_path(config2_path)

    text_config_path = config.getoption("text_config_path", None)
    text_config_path = pop_value(
        text_config_path,
        nested_dict_value(pretrained_infos, "text", "config"),
    )
    text_config_path = cached_path(text_config_path)

    vae_config_path = config.getoption("vae_config_path", None)
    vae_config_path = pop_value(
        vae_config_path,
        nested_dict_value(pretrained_infos, "vae", "config"),
    )
    vae_config_path = cached_path(vae_config_path)

    scheduler_config_path = config.getoption("scheduler_config_path", None)
    scheduler_config_path = pop_value(
        scheduler_config_path,
        nested_dict_value(pretrained_infos, "scheduler"),
    )
    scheduler_config_path = cached_path(scheduler_config_path)

    num_train_timesteps = config.getoption("num_train_timesteps", 1000)
    num_infer_timesteps = config.getoption("num_infer_timesteps", 50)
    freeze_vae_encoder = config.getoption("freeze_vae_encoder", True)
    freeze_text_encoder = config.getoption("freeze_text_encoder", True)
    snr_gamma = config.getoption("snr_gamma", 5.0)
    # Default comes from the registry entry; a missing or falsy entry
    # falls back to 0.9.
    boundary_ratio = config.getoption(
        "boundary_ratio",
        nested_dict_value(pretrained_infos, "boundary_ratio") or 0.9,
    )
    seed = config.getoption("seed", 1123)
    gradient_checkpointing = config.getoption("gradient_checkpointing", True)

    inst = cls(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        config2_path=config2_path,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        boundary_ratio=boundary_ratio,
        seed=seed,
        gradient_checkpointing=gradient_checkpointing,
    )

    weight_path = config.getoption("pretrained_weight_path", None)

    state_dict = None
    if weight_path is None and pretrained_infos is not None:
        # No explicit checkpoint: load each component's registry weights and
        # prefix the keys with the component name.
        # NOTE(review): the "transformer2" weight is loaded unconditionally
        # although config2_path is optional -- confirm load_weight tolerates
        # a missing registry entry.
        state_dict = [
            load_weight(
                nested_dict_value(pretrained_infos, "transformer", "weight"),
                prefix_keys={"": "transformer."},
            ),
            load_weight(
                nested_dict_value(pretrained_infos, "transformer2", "weight"),
                prefix_keys={"": "transformer2."},
            ),
            load_weight(
                nested_dict_value(pretrained_infos, "text", "weight"),
                prefix_keys={"": "text."},
            ),
            load_weight(
                nested_dict_value(pretrained_infos, "vae", "weight"),
                prefix_keys={"": "vae."},
            ),
        ]

    # With neither an explicit checkpoint nor registry weights there is
    # nothing to load, so do not call from_pretrained with two None values.
    if weight_path is not None or state_dict is not None:
        inst.from_pretrained(weight_path, state_dict=state_dict)

    pretrained_lora_names = config.getoption("pretrained_lora_names", None)
    pretrained_lora_weights = config.getoption("pretrained_lora_weights", 1.0)
    pretrained_lora_alphas = config.getoption("pretrained_lora_alphas", 32.0)

    # A single name maps to one weight entry, a list of names to a list.
    if isinstance(pretrained_lora_names, str):
        pretrained_lora_weights_path = nested_dict_value(
            pretrained_stable_extensions_infos,
            pretrained_lora_names,
            "lora",
            "weight",
        )
    elif isinstance(pretrained_lora_names, list):
        pretrained_lora_weights_path = [
            nested_dict_value(
                pretrained_stable_extensions_infos, name, "lora", "weight"
            )
            for name in pretrained_lora_names
        ]
    else:
        pretrained_lora_weights_path = None

    # The registry lookup above is only the default for this option.
    lora_weights_path = config.getoption(
        "pretrained_lora_weights_path", pretrained_lora_weights_path
    )
    if lora_weights_path is not None:
        inst.load_lora_weights(
            lora_files=lora_weights_path,
            lora_weights=pretrained_lora_weights,
            lora_alphas=pretrained_lora_alphas,
            replace_keys={},
            save_base_state=False,
        )
    return inst

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

@autocast(
    device_type=("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=(torch.bfloat16 if is_bfloat16_available() else torch.float32),
)
def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """Run the parent ``forward`` and wrap its result in ``LossOutputs``."""
    parent_forward = super().forward
    return LossOutputs(
        loss=parent_forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
        )
    )

generate ¤

generate(
    input_ids: Tensor,
    negative_input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    height: Optional[int] = 480,
    width: Optional[int] = 832,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
)

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

@config_defaults_method("core/model/diffusers/text2video/wan")
@autocast(
    device_type=("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=(torch.bfloat16 if is_bfloat16_available() else torch.float32),
)
def generate(
    self,
    input_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    height: Optional[int] = 480,
    width: Optional[int] = 832,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
):
    """Run the parent ``generate`` and return its frames, cast to float, as ``DiffusionOutputs``."""
    generated = super().generate(
        input_ids=input_ids,
        negative_input_ids=negative_input_ids,
        attention_mask=attention_mask,
        negative_attention_mask=negative_attention_mask,
        height=height,
        width=width,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
    )
    frames = generated.frames
    return DiffusionOutputs(outputs=frames.float())

WanForImage2VideoGeneration¤

Tip

Configuration for WanForImage2VideoGeneration is read from the `core/model/diffusers/image2video/wan` section.

Bases: WanForImage2VideoGeneration

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    config2_path: Optional[str] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    boundary_ratio: Optional[float] = 0.9,
    seed: Optional[int] = 1123,
    gradient_checkpointing: Optional[bool] = True,
):
    """Forward every constructor argument, by keyword, to the parent model."""
    parent_kwargs = {
        "config_path": config_path,
        "text_config_path": text_config_path,
        "vae_config_path": vae_config_path,
        "scheduler_config_path": scheduler_config_path,
        "config2_path": config2_path,
        "num_train_timesteps": num_train_timesteps,
        "num_infer_timesteps": num_infer_timesteps,
        "freeze_vae_encoder": freeze_vae_encoder,
        "freeze_text_encoder": freeze_text_encoder,
        "snr_gamma": snr_gamma,
        "boundary_ratio": boundary_ratio,
        "seed": seed,
        "gradient_checkpointing": gradient_checkpointing,
    }
    super().__init__(**parent_kwargs)

from_config `classmethod` ¤

from_config(config, **kwargs)

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

@classmethod
@config_defaults_init("core/model/diffusers/image2video/wan")
def from_config(cls, config, **kwargs):
    """Build the model from the ``core/model/diffusers/image2video/wan`` section.

    Steps, in order:
      1. Resolve the config file paths (transformer, optional second
         transformer, text encoder, VAE, scheduler) from the config options
         and the ``pretrained_name`` entry of ``pretrained_stable_infos``.
      2. Read the scalar hyper-parameters and instantiate ``cls``.
      3. Load weights from the registry entry when no
         ``pretrained_weight_path`` is set, otherwise from that path. If
         neither is available, no weights are loaded.
      4. Optionally apply LoRA weights named by ``pretrained_lora_names`` or
         given directly by ``pretrained_lora_weights_path``.

    Args:
        config: Configuration object providing ``set_default_section`` and
            ``getoption``.
        **kwargs: Accepted but not used in this method body.

    Returns:
        The constructed instance.
    """
    config.set_default_section("core/model/diffusers/image2video/wan")
    pretrained_name = config.getoption("pretrained_name", "wan-v2.2-i2v-14b")
    pretrained_infos = nested_dict_value(pretrained_stable_infos, pretrained_name)

    config_path = config.getoption("config_path", None)
    config_path = pop_value(
        config_path,
        nested_dict_value(pretrained_infos, "transformer", "config"),
    )
    config_path = cached_path(config_path)

    config2_path = config.getoption("config2_path", None)
    config2_path = pop_value(
        config2_path,
        nested_dict_value(pretrained_infos, "transformer2", "config"),
    )

    # The second transformer config is optional; only resolve it when present.
    if config2_path is not None:
        config2_path = cached_path(config2_path)

    text_config_path = config.getoption("text_config_path", None)
    text_config_path = pop_value(
        text_config_path,
        nested_dict_value(pretrained_infos, "text", "config"),
    )
    text_config_path = cached_path(text_config_path)

    vae_config_path = config.getoption("vae_config_path", None)
    vae_config_path = pop_value(
        vae_config_path,
        nested_dict_value(pretrained_infos, "vae", "config"),
    )
    vae_config_path = cached_path(vae_config_path)

    scheduler_config_path = config.getoption("scheduler_config_path", None)
    scheduler_config_path = pop_value(
        scheduler_config_path,
        nested_dict_value(pretrained_infos, "scheduler"),
    )
    scheduler_config_path = cached_path(scheduler_config_path)

    num_train_timesteps = config.getoption("num_train_timesteps", 1000)
    num_infer_timesteps = config.getoption("num_infer_timesteps", 50)
    freeze_vae_encoder = config.getoption("freeze_vae_encoder", True)
    freeze_text_encoder = config.getoption("freeze_text_encoder", True)
    snr_gamma = config.getoption("snr_gamma", 5.0)
    # Default comes from the registry entry; a missing or falsy entry
    # falls back to 0.9.
    boundary_ratio = config.getoption(
        "boundary_ratio",
        nested_dict_value(pretrained_infos, "boundary_ratio") or 0.9,
    )
    seed = config.getoption("seed", 1123)
    gradient_checkpointing = config.getoption("gradient_checkpointing", True)

    inst = cls(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        config2_path=config2_path,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        boundary_ratio=boundary_ratio,
        seed=seed,
        gradient_checkpointing=gradient_checkpointing,
    )

    weight_path = config.getoption("pretrained_weight_path", None)

    state_dict = None
    if weight_path is None and pretrained_infos is not None:
        # No explicit checkpoint: load each component's registry weights and
        # prefix the keys with the component name.
        # NOTE(review): the "transformer2" weight is loaded unconditionally
        # although config2_path is optional -- confirm load_weight tolerates
        # a missing registry entry.
        state_dict = [
            load_weight(
                nested_dict_value(pretrained_infos, "transformer", "weight"),
                prefix_keys={"": "transformer."},
            ),
            load_weight(
                nested_dict_value(pretrained_infos, "transformer2", "weight"),
                prefix_keys={"": "transformer2."},
            ),
            load_weight(
                nested_dict_value(pretrained_infos, "text", "weight"),
                prefix_keys={"": "text."},
            ),
            load_weight(
                nested_dict_value(pretrained_infos, "vae", "weight"),
                prefix_keys={"": "vae."},
            ),
        ]

    elif weight_path is not None:
        # An explicit checkpoint path is loaded as-is, with no key prefixes.
        state_dict = load_weight(weight_path)

    # With neither source available there is nothing to load.
    if state_dict is not None:
        inst.from_pretrained(state_dict=state_dict)

    pretrained_lora_names = config.getoption("pretrained_lora_names", None)
    pretrained_lora_weights = config.getoption("pretrained_lora_weights", 1.0)
    pretrained_lora_alphas = config.getoption("pretrained_lora_alphas", 32.0)

    # A single name maps to one weight entry, a list of names to a list.
    if isinstance(pretrained_lora_names, str):
        pretrained_lora_weights_path = nested_dict_value(
            pretrained_stable_extensions_infos,
            pretrained_lora_names,
            "lora",
            "weight",
        )
    elif isinstance(pretrained_lora_names, list):
        pretrained_lora_weights_path = [
            nested_dict_value(
                pretrained_stable_extensions_infos, name, "lora", "weight"
            )
            for name in pretrained_lora_names
        ]
    else:
        pretrained_lora_weights_path = None

    # The registry lookup above is only the default for this option.
    lora_weights_path = config.getoption(
        "pretrained_lora_weights_path", pretrained_lora_weights_path
    )
    if lora_weights_path is not None:
        inst.load_lora_weights(
            lora_files=lora_weights_path,
            lora_weights=pretrained_lora_weights,
            lora_alphas=pretrained_lora_alphas,
            replace_keys={},
            save_base_state=False,
        )
    return inst

forward ¤

forward(
    pixel_values: Tensor,
    vae_pixel_values: Tensor,
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

@autocast(
    device_type=("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=(torch.bfloat16 if is_bfloat16_available() else torch.float32),
)
def forward(
    self,
    pixel_values: torch.Tensor,
    vae_pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """Run the parent ``forward`` and wrap its result in ``LossOutputs``."""
    parent_forward = super().forward
    return LossOutputs(
        loss=parent_forward(
            pixel_values=pixel_values,
            vae_pixel_values=vae_pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
    )

generate ¤

generate(
    input_ids: Tensor,
    negative_input_ids: Tensor,
    vae_pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
)

Source code in src/unitorch/cli/models/diffusers/modeling_wan.py

@config_defaults_method("core/model/diffusers/image2video/wan")
@autocast(
    device_type=("cuda" if torch.cuda.is_available() else "cpu"),
    dtype=(torch.bfloat16 if is_bfloat16_available() else torch.float32),
)
def generate(
    self,
    input_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    vae_pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
):
    """Run the parent ``generate`` and return its frames, cast to float, as ``DiffusionOutputs``."""
    generated = super().generate(
        input_ids=input_ids,
        negative_input_ids=negative_input_ids,
        vae_pixel_values=vae_pixel_values,
        attention_mask=attention_mask,
        negative_attention_mask=negative_attention_mask,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
    )
    frames = generated.frames
    return DiffusionOutputs(outputs=frames.float())

QWenImageProcessor¤

Tip

Configuration for QWenImageProcessor is read from the `core/process/diffusion/qwen_image` section.

Bases: QWenImageProcessor

Source code in src/unitorch/cli/models/diffusers/processing_qwen_image.py

def __init__(
    self,
    vocab_path: str,
    merge_path: str,
    vision_config_path: Optional[str] = None,
    vae_config_path: Optional[str] = None,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    max_seq_length: Optional[int] = 12800,
    image_size: Optional[Tuple[int, int]] = None,
    center_crop: Optional[bool] = False,
    random_flip: Optional[bool] = False,
):
    """Forward every constructor argument, by keyword, to the parent processor."""
    parent_kwargs = {
        "vocab_path": vocab_path,
        "merge_path": merge_path,
        "vision_config_path": vision_config_path,
        "vae_config_path": vae_config_path,
        "tokenizer_config": tokenizer_config,
        "special_tokens_map": special_tokens_map,
        "max_seq_length": max_seq_length,
        "image_size": image_size,
        "center_crop": center_crop,
        "random_flip": random_flip,
    }
    super().__init__(**parent_kwargs)

from_config `classmethod` ¤

from_config(config, **kwargs)

Source code in src/unitorch/cli/models/diffusers/processing_qwen_image.py

@classmethod
@config_defaults_init("core/process/diffusion/qwen_image")
def from_config(cls, config, **kwargs):
    """Resolve the file paths used to construct the processor.

    Options are read from the ``core/process/diffusion/qwen_image`` section.
    Each path option is combined through ``pop_value`` with the entry
    registered for ``pretrained_name`` in ``pretrained_stable_infos``.
    ``vocab_path``, ``merge_path`` and ``vae_config_path`` always go through
    ``cached_path``; the tokenizer config, special tokens map and vision
    config are looked up with ``check_none=False`` and are cached only when
    they are not ``None``.

    Returns:
        A dict with the keys ``vocab_path``, ``merge_path``,
        ``vision_config_path``, ``vae_config_path``, ``tokenizer_config`` and
        ``special_tokens_map``.
    """
    config.set_default_section("core/process/diffusion/qwen_image")
    name = config.getoption("pretrained_name", "qwen-image")
    infos = nested_dict_value(pretrained_stable_infos, name)

    def lookup(option, *info_keys, **pop_kwargs):
        # Read the configured option, look up the registry entry, and let
        # pop_value choose between them.
        configured = config.getoption(option, None)
        registered = nested_dict_value(infos, *info_keys)
        return pop_value(configured, registered, **pop_kwargs)

    def cache_if_set(path):
        # Optional files stay None when nothing was resolved.
        return None if path is None else cached_path(path)

    # Lookups run in the same order as the options are declared below.
    vocab = cached_path(lookup("vocab_path", "text", "vocab"))
    merge = cached_path(lookup("merge_path", "text", "merge"))
    tok_config = cache_if_set(
        lookup("tokenizer_config", "text", "tokenizer_config", check_none=False)
    )
    tokens_map = cache_if_set(
        lookup("special_tokens_map", "text", "special_tokens_map", check_none=False)
    )
    vision_config = cache_if_set(
        lookup("vision_config_path", "vision_config", check_none=False)
    )
    vae_config = cached_path(lookup("vae_config_path", "vae", "config"))

    return {
        "vocab_path": vocab,
        "merge_path": merge,
        "vision_config_path": vision_config,
        "vae_config_path": vae_config,
        "tokenizer_config": tok_config,
        "special_tokens_map": tokens_map,
    }

_text2image ¤

_text2image(
    prompt: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_qwen_image.py

@register_process("core/process/diffusion/qwen_image/text2image")
def _text2image(
    self,
    prompt: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``text2image`` and repackage its fields as ``TensorInputs``."""
    encoded = super().text2image(
        prompt=prompt, image=image, max_seq_length=max_seq_length
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = ("input_ids", "attention_mask", "pixel_values")
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

_text2image_inputs ¤

_text2image_inputs(
    prompt: str,
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_qwen_image.py

@register_process("core/process/diffusion/qwen_image/text2image/inputs")
def _text2image_inputs(
    self,
    prompt: str,
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``text2image_inputs`` and repackage its fields as ``TensorInputs``."""
    encoded = super().text2image_inputs(
        prompt=prompt,
        negative_prompt=negative_prompt,
        max_seq_length=max_seq_length,
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = (
        "input_ids",
        "attention_mask",
        "negative_input_ids",
        "negative_attention_mask",
    )
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

_editing_inputs ¤

_editing_inputs(
    prompt: str,
    refer_image: Union[Image, str],
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_qwen_image.py

@register_process("core/process/diffusion/qwen_image/editing/inputs")
def _editing_inputs(
    self,
    prompt: str,
    refer_image: Union[Image.Image, str],
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``editing_inputs`` and repackage its fields as ``TensorInputs``."""
    encoded = super().editing_inputs(
        prompt=prompt,
        refer_image=refer_image,
        negative_prompt=negative_prompt,
        max_seq_length=max_seq_length,
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = (
        "input_ids",
        "attention_mask",
        "refer_pixel_values",
        "refer_image_grid_thw",
        "refer_vae_pixel_values",
        "negative_input_ids",
        "negative_attention_mask",
    )
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

_editing ¤

_editing(
    prompt: str,
    refer_image: Union[Image, str],
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/cli/models/diffusers/processing_qwen_image.py

@register_process("core/process/diffusion/qwen_image/editing")
def _editing(
    self,
    prompt: str,
    refer_image: Union[Image.Image, str],
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """Call the parent ``editing`` and repackage its fields as ``TensorInputs``."""
    encoded = super().editing(
        prompt=prompt,
        refer_image=refer_image,
        image=image,
        max_seq_length=max_seq_length,
    )
    # Fields copied from the parent result, in the order they are passed on.
    field_names = (
        "input_ids",
        "attention_mask",
        "pixel_values",
        "refer_image_grid_thw",
        "refer_pixel_values",
        "refer_vae_pixel_values",
    )
    return TensorInputs(**{name: getattr(encoded, name) for name in field_names})

QWenImageText2ImageGeneration¤

Tip

Configuration for QWenImageText2ImageGeneration is read from the `core/model/diffusers/text2image/qwen_image` section.

Bases: QWenImageText2ImageGeneration