unitorch.models.diffusers¤

WanForText2VideoGeneration¤

Bases: GenericWanModel

Source code in src/unitorch/models/diffusers/modeling_wan.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    config2_path: Optional[str] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    boundary_ratio: Optional[float] = 0.9,
    seed: Optional[int] = 1123,
    gradient_checkpointing: Optional[bool] = True,
):
    """
    Set up the Wan text-to-video model and its inference pipeline.

    Every argument except ``gradient_checkpointing`` is forwarded unchanged
    to the base-class constructor.

    Args:
        gradient_checkpointing (bool, optional): When truthy, gradient
            checkpointing is enabled on ``self.transformer`` and, if the
            attribute exists, on ``self.transformer2``. Defaults to True.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        config2_path=config2_path,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        boundary_ratio=boundary_ratio,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    if gradient_checkpointing:
        # The second transformer is optional, so only touch it when present.
        checkpointed = [self.transformer]
        if hasattr(self, "transformer2"):
            checkpointed.append(self.transformer2)
        for module in checkpointed:
            module.enable_gradient_checkpointing()

    # The pipeline reuses the already-built sub-modules; no tokenizer is
    # attached because generation is driven by pre-tokenized ids.
    second_transformer = getattr(self, "transformer2", None)
    self.pipeline = WanPipeline(
        vae=self.vae,
        text_encoder=self.text,
        transformer=self.transformer,
        transformer_2=second_transformer,
        scheduler=self.scheduler,
        tokenizer=None,
        boundary_ratio=self.boundary_ratio,
    )
    self.pipeline.set_progress_bar_config(disable=True)

pipeline `instance-attribute` ¤

pipeline = WanPipeline(
    vae=vae,
    text_encoder=text,
    transformer=transformer,
    transformer_2=getattr(self, "transformer2", None),
    scheduler=scheduler,
    tokenizer=None,
    boundary_ratio=boundary_ratio,
)

forward ¤

forward(
    pixel_values: Tensor,
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/models/diffusers/modeling_wan.py

def forward(
    self,
    pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Compute the training loss for one batch of videos and prompts.

    The video is encoded by the VAE, normalized with the VAE's configured
    latent statistics, blended with Gaussian noise according to a sampled
    sigma, and the selected transformer is trained to predict
    ``noise - latents``.

    Args:
        pixel_values (torch.Tensor): Video batch passed to ``self.vae.encode``.
            The latent statistics are broadcast as ``(1, z_dim, 1, 1, 1)``,
            so the encoded latents are expected to be 5-D.
        input_ids (torch.Tensor): Token ids passed to the text encoder.
        attention_mask (torch.Tensor, optional): Mask passed to the text encoder.

    Returns:
        torch.Tensor: Scalar loss (per-sample mean squared error, averaged
        over the batch).
    """
    latents = self.vae.encode(pixel_values).latent_dist.sample()
    # Sample noise directly with the latents' device and dtype so the blend
    # below cannot be promoted to a different precision.
    noise = torch.randn_like(latents)
    batch = latents.shape[0]

    stats_shape = (1, self.vae.config.z_dim, 1, 1, 1)
    latents_mean = (
        torch.tensor(self.vae.config.latents_mean)
        .view(stats_shape)
        .to(latents.device, latents.dtype)
    )
    # Stored as the reciprocal so normalization is a multiplication.
    latents_inv_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(
        stats_shape
    ).to(latents.device, latents.dtype)
    latents = (latents - latents_mean) * latents_inv_std

    u = compute_density_for_timestep_sampling(
        weighting_scheme="none",
        batch_size=batch,
        logit_mean=0.0,
        logit_std=1.0,
        mode_scale=1.29,
    )

    num_train_timesteps = self.scheduler.config.num_train_timesteps
    # transformer2 is optional (see the hasattr/getattr handling in __init__).
    # Without it, every step must use the primary transformer and may draw
    # from the full timestep range instead of crashing on the else-branch.
    has_transformer2 = getattr(self, "transformer2", None) is not None
    if has_transformer2:
        use_transformer = random.random() < self.boundary_ratio
        max_timesteps = int(num_train_timesteps * self.boundary_ratio)
    else:
        use_transformer = True
        max_timesteps = num_train_timesteps

    if use_transformer:
        indices = (u * max_timesteps).long()
    else:
        indices = (u * (num_train_timesteps - max_timesteps)).long() + int(
            max_timesteps
        )
    timesteps = self.scheduler.timesteps[indices].to(device=self.device)

    sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)
    noise_latents = (1.0 - sigmas) * latents + sigmas * noise

    encoder_hidden_states = self.text(input_ids, attention_mask)[0]
    denoiser = self.transformer if use_transformer else self.transformer2
    outputs = denoiser(
        noise_latents,
        timesteps,
        encoder_hidden_states=encoder_hidden_states,
    ).sample

    weighting = compute_loss_weighting_for_sd3(
        weighting_scheme="none", sigmas=sigmas
    )
    target = noise - latents
    # Loss is accumulated in float32 regardless of the model dtype.
    loss = torch.mean(
        (weighting.float() * (outputs.float() - target.float()) ** 2).reshape(
            target.shape[0], -1
        ),
        1,
    )
    return loss.mean()

generate ¤

generate(
    input_ids: Tensor,
    negative_input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    height: Optional[int] = 480,
    width: Optional[int] = 832,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
)

Source code in src/unitorch/models/diffusers/modeling_wan.py

def generate(
    self,
    input_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    height: Optional[int] = 480,
    width: Optional[int] = 832,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
):
    """
    Generate video frames from tokenized positive and negative prompts.

    Args:
        input_ids (torch.Tensor): Token ids of the prompt.
        negative_input_ids (torch.Tensor): Token ids of the negative prompt.
        attention_mask (torch.Tensor, optional): Mask for ``input_ids``.
        negative_attention_mask (torch.Tensor, optional): Mask for
            ``negative_input_ids``.
        height (int, optional): Forwarded to the pipeline. Defaults to 480.
        width (int, optional): Forwarded to the pipeline. Defaults to 832.
        num_frames (int, optional): Forwarded to the pipeline. Defaults to 81.
        guidance_scale (float, optional): Forwarded to the pipeline.
            Defaults to 5.0.

    Returns:
        GenericOutputs: Object whose ``frames`` field holds the pipeline's
        ``frames`` output (requested with ``output_type="pt"``).
    """
    prompt_outputs = self.get_prompt_outputs(
        input_ids=input_ids,
        negative_input_ids=negative_input_ids,
        attention_mask=attention_mask,
        negative_attention_mask=negative_attention_mask,
    )

    # A fresh generator seeded with self.seed is built for every call.
    generator = torch.Generator(device=self.pipeline.device)
    generator = generator.manual_seed(self.seed)

    result = self.pipeline(
        prompt_embeds=prompt_outputs.prompt_embeds,
        negative_prompt_embeds=prompt_outputs.negative_prompt_embeds,
        generator=generator,
        num_inference_steps=self.num_infer_timesteps,
        height=height,
        width=width,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        output_type="pt",
    )
    return GenericOutputs(frames=result.frames)

WanForImage2VideoGeneration¤

Bases: GenericWanModel

Source code in src/unitorch/models/diffusers/modeling_wan.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    config2_path: Optional[str] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    boundary_ratio: Optional[float] = 0.9,
    seed: Optional[int] = 1123,
    gradient_checkpointing: Optional[bool] = True,
):
    """
    Set up the Wan image-to-video model and its inference pipeline.

    Every argument except ``gradient_checkpointing`` is forwarded unchanged
    to the base-class constructor.

    Args:
        gradient_checkpointing (bool, optional): When truthy, gradient
            checkpointing is enabled on ``self.transformer`` and, if the
            attribute exists, on ``self.transformer2``. Defaults to True.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        config2_path=config2_path,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        boundary_ratio=boundary_ratio,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    if gradient_checkpointing:
        # The second transformer is optional, so only touch it when present.
        checkpointed = [self.transformer]
        if hasattr(self, "transformer2"):
            checkpointed.append(self.transformer2)
        for module in checkpointed:
            module.enable_gradient_checkpointing()

    # The pipeline is assembled from the already-built sub-modules; neither
    # a tokenizer nor an image processor is attached.
    second_transformer = getattr(self, "transformer2", None)
    self.pipeline = WanImageToVideoPipeline(
        vae=self.vae,
        text_encoder=self.text,
        transformer=self.transformer,
        transformer_2=second_transformer,
        scheduler=self.scheduler,
        tokenizer=None,
        image_processor=None,
        boundary_ratio=self.boundary_ratio,
    )
    self.pipeline.set_progress_bar_config(disable=True)

pipeline `instance-attribute` ¤

pipeline = WanImageToVideoPipeline(
    vae=vae,
    text_encoder=text,
    transformer=transformer,
    transformer_2=getattr(self, "transformer2", None),
    scheduler=scheduler,
    tokenizer=None,
    image_processor=None,
    boundary_ratio=boundary_ratio,
)

forward ¤

forward(
    pixel_values: Tensor,
    vae_pixel_values: Tensor,
    input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
)

Source code in src/unitorch/models/diffusers/modeling_wan.py

def forward(
    self,
    pixel_values: torch.Tensor,
    vae_pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
):
    """
    Compute the image-conditioned training loss for one batch.

    The target video is encoded and noised, while the conditioning image is
    expanded to a video whose first frame is the image and whose remaining
    frames are zeros. That video is encoded too, and a first-frame mask plus
    the condition latents are concatenated to the noisy latents along the
    channel axis before calling the transformer.

    Args:
        pixel_values (torch.Tensor): Target video batch passed to
            ``self.vae.encode``; its third-from-last axis is read as the
            number of frames.
        vae_pixel_values (torch.Tensor): Conditioning image batch; a frame
            axis is inserted at dim 2.
        input_ids (torch.Tensor): Token ids passed to the text encoder.
        attention_mask (torch.Tensor, optional): Mask passed to the text encoder.

    Returns:
        torch.Tensor: Scalar loss (per-sample mean squared error against
        ``noise - latents``, averaged over the batch).
    """
    latents = self.vae.encode(pixel_values).latent_dist.sample()
    # Sample noise directly with the latents' device and dtype so the blend
    # below cannot be promoted to a different precision.
    noise = torch.randn_like(latents)
    batch = latents.shape[0]

    stats_shape = (1, self.vae.config.z_dim, 1, 1, 1)
    latents_mean = (
        torch.tensor(self.vae.config.latents_mean)
        .view(stats_shape)
        .to(latents.device, latents.dtype)
    )
    # Stored as the reciprocal so normalization is a multiplication.
    latents_inv_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(
        stats_shape
    ).to(latents.device, latents.dtype)
    latents = (latents - latents_mean) * latents_inv_std

    u = compute_density_for_timestep_sampling(
        weighting_scheme="none",
        batch_size=batch,
        logit_mean=0.0,
        logit_std=1.0,
        mode_scale=1.29,
    )

    num_train_timesteps = self.scheduler.config.num_train_timesteps
    # transformer2 is optional (see the hasattr/getattr handling in __init__).
    # Without it, every step must use the primary transformer and may draw
    # from the full timestep range instead of crashing on the else-branch.
    has_transformer2 = getattr(self, "transformer2", None) is not None
    if has_transformer2:
        use_transformer = random.random() < self.boundary_ratio
        max_timesteps = int(num_train_timesteps * self.boundary_ratio)
    else:
        use_transformer = True
        max_timesteps = num_train_timesteps

    if use_transformer:
        indices = (u * max_timesteps).long()
    else:
        indices = (u * (num_train_timesteps - max_timesteps)).long() + int(
            max_timesteps
        )
    timesteps = self.scheduler.timesteps[indices].to(device=self.device)

    sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)
    noise_latents = (1.0 - sigmas) * latents + sigmas * noise

    num_frames = pixel_values.shape[-3]

    # Conditioning video: the image as frame 0 followed by zero frames.
    video_condition = torch.cat(
        [
            vae_pixel_values.unsqueeze(2),
            vae_pixel_values.new_zeros(
                vae_pixel_values.shape[0],
                vae_pixel_values.shape[1],
                num_frames - 1,
                vae_pixel_values.shape[-2],
                vae_pixel_values.shape[-1],
                device=vae_pixel_values.device,
            ),
        ],
        dim=2,
    )
    latent_condition = self.vae.encode(video_condition).latent_dist.mode()
    latent_condition = latent_condition.to(latents.dtype)
    latent_condition = (latent_condition - latents_mean) * latents_inv_std

    # Per-frame mask in pixel-frame space: 1 for the first frame, 0 elsewhere.
    mask_lat_size = torch.ones(
        latents.shape[0],
        1,
        num_frames,
        latents.shape[-2],
        latents.shape[-1],
        device=latents.device,
        dtype=latents.dtype,
    )
    mask_lat_size[:, :, 1:] = 0
    # Repeat the first frame so the frame count becomes divisible by the
    # temporal scale factor, then fold groups of frames into channels.
    temporal_factor = self.pipeline.vae_scale_factor_temporal
    first_frame_mask = torch.repeat_interleave(
        mask_lat_size[:, :, 0:1], dim=2, repeats=temporal_factor
    )
    mask_lat_size = torch.concat(
        [first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2
    )
    mask_lat_size = mask_lat_size.view(
        latents.shape[0],
        -1,
        temporal_factor,
        latents.shape[-2],
        latents.shape[-1],
    )
    mask_lat_size = mask_lat_size.transpose(1, 2)
    mask_lat_size = mask_lat_size.to(latent_condition.device)
    condition_latents = torch.concat([mask_lat_size, latent_condition], dim=1)
    latent_model_input = torch.cat([noise_latents, condition_latents], dim=1)

    encoder_hidden_states = self.text(input_ids, attention_mask)[0]
    denoiser = self.transformer if use_transformer else self.transformer2
    outputs = denoiser(
        latent_model_input,
        timesteps,
        encoder_hidden_states=encoder_hidden_states,
    ).sample

    weighting = compute_loss_weighting_for_sd3(
        weighting_scheme="none", sigmas=sigmas
    )
    target = noise - latents
    # Loss is accumulated in float32 regardless of the model dtype.
    loss = torch.mean(
        (weighting.float() * (outputs.float() - target.float()) ** 2).reshape(
            target.shape[0], -1
        ),
        1,
    )
    return loss.mean()

generate ¤

generate(
    input_ids: Tensor,
    negative_input_ids: Tensor,
    vae_pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
)

Source code in src/unitorch/models/diffusers/modeling_wan.py

def generate(
    self,
    input_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    vae_pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    num_frames: Optional[int] = 81,
    guidance_scale: Optional[float] = 5.0,
):
    """
    Generate video frames conditioned on an image and tokenized prompts.

    Args:
        input_ids (torch.Tensor): Token ids of the prompt.
        negative_input_ids (torch.Tensor): Token ids of the negative prompt.
        vae_pixel_values (torch.Tensor): Conditioning image passed to the
            pipeline as ``image``; its last two axes supply the output
            height and width.
        attention_mask (torch.Tensor, optional): Mask for ``input_ids``.
        negative_attention_mask (torch.Tensor, optional): Mask for
            ``negative_input_ids``.
        num_frames (int, optional): Forwarded to the pipeline. Defaults to 81.
        guidance_scale (float, optional): Forwarded to the pipeline.
            Defaults to 5.0.

    Returns:
        GenericOutputs: Object whose ``frames`` field holds the pipeline's
        ``frames`` output (requested with ``output_type="pt"``).
    """
    prompt_outputs = self.get_prompt_outputs(
        input_ids=input_ids,
        negative_input_ids=negative_input_ids,
        attention_mask=attention_mask,
        negative_attention_mask=negative_attention_mask,
    )

    # A fresh generator seeded with self.seed is built for every call.
    generator = torch.Generator(device=self.pipeline.device)
    generator = generator.manual_seed(self.seed)

    # The output resolution follows the conditioning image.
    out_height = vae_pixel_values.size(-2)
    out_width = vae_pixel_values.size(-1)

    result = self.pipeline(
        image=vae_pixel_values,
        prompt_embeds=prompt_outputs.prompt_embeds,
        negative_prompt_embeds=prompt_outputs.negative_prompt_embeds,
        generator=generator,
        num_inference_steps=self.num_infer_timesteps,
        height=out_height,
        width=out_width,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        output_type="pt",
    )
    return GenericOutputs(frames=result.frames)

QWenImageProcessor¤

Bases: HfTextClassificationProcessor

Initializes the QWenImageProcessor.

Parameters:

Name	Type	Description	Default
`vocab_path`	`str`	The path to the vocabulary file.	required
`merge_path`	`str`	The path to the merge file.	required
`max_seq_length`	`int`	The maximum sequence length for text inputs. Defaults to 12800.	`12800`

Source code in src/unitorch/models/diffusers/processing_qwen_image.py

def __init__(
    self,
    vocab_path: str,
    merge_path: str,
    vision_config_path: Optional[str] = None,
    vae_config_path: Optional[str] = None,
    tokenizer_config: Optional[str] = None,
    special_tokens_map: Optional[str] = None,
    max_seq_length: Optional[int] = 12800,
    image_size: Optional[Tuple[int, int]] = None,
    center_crop: Optional[bool] = False,
    random_flip: Optional[bool] = False,
):
    """
    Initializes the QWenImageProcessor.

    Args:
        vocab_path (str): The path to the vocabulary file.
        merge_path (str): The path to the merge file.
        vision_config_path (str, optional): JSON config used to build the
            reference-image ``Qwen2VLImageProcessor``. When omitted,
            ``refer_vision_processor`` is None.
        vae_config_path (str, optional): JSON VAE config used to derive the
            scale factor of the ``VaeImageProcessor``. When omitted,
            ``vae_image_processor`` is None.
        tokenizer_config (str, optional): Path to a tokenizer config JSON file.
        special_tokens_map (str, optional): Path to a special-tokens-map
            JSON file.
        max_seq_length (int, optional): The maximum sequence length for text inputs. Defaults to 12800.
        image_size (int or tuple/list of two ints, optional): Target size
            indexed as ``(image_size[0], image_size[1])``; a single int is
            used for both entries. When None, images are not resized or cropped.
        center_crop (bool, optional): Use ``CenterCrop`` instead of
            ``RandomCrop`` when ``image_size`` is set. Defaults to False.
        random_flip (bool, optional): Apply ``RandomHorizontalFlip``.
            Defaults to False.
    """
    tokenizer_config = read_json_file(tokenizer_config) if tokenizer_config else {}
    special_tokens_map = (
        read_json_file(special_tokens_map) if special_tokens_map else {}
    )
    # Added tokens are registered manually after the tokenizer is built.
    added_tokens_decoder = tokenizer_config.pop("added_tokens_decoder", {})
    tokenizer_config = {
        k: (
            get_added_token(v)
            if isinstance(v, dict) and v.get("__type") == "AddedToken"
            else v
        )
        for k, v in tokenizer_config.items()
    }
    tokenizer = Qwen2TokenizerFast(
        vocab_file=vocab_path,
        merges_file=merge_path,
        **tokenizer_config,
    )
    for idx, spec in added_tokens_decoder.items():
        token = spec["content"]
        tokenizer.added_tokens_decoder[idx] = get_added_token(spec)
        tokenizer.added_tokens_encoder[token] = idx

    # Accept both serialized AddedToken dicts and plain string tokens. The
    # previous `isinstance(spec, dict or str)` only ever matched dicts, so
    # string-valued entries were silently dropped.
    special_tokens = {}
    for name, spec in special_tokens_map.items():
        if isinstance(spec, dict):
            special_tokens[name] = get_added_token(spec)
        elif isinstance(spec, str):
            special_tokens[name] = spec
    tokenizer.add_special_tokens(special_tokens)

    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token

    super().__init__(
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
    )

    self.image_token = (
        "<|image_pad|>"
        if not hasattr(tokenizer, "image_token")
        else tokenizer.image_token
    )
    self.video_token = (
        "<|video_pad|>"
        if not hasattr(tokenizer, "video_token")
        else tokenizer.video_token
    )
    self.image_token_id = (
        tokenizer.image_token_id
        if getattr(tokenizer, "image_token_id", None)
        else tokenizer.convert_tokens_to_ids(self.image_token)
    )
    self.video_token_id = (
        tokenizer.video_token_id
        if getattr(tokenizer, "video_token_id", None)
        else tokenizer.convert_tokens_to_ids(self.video_token)
    )

    if vision_config_path is not None:
        self.refer_vision_processor = Qwen2VLImageProcessor.from_json_file(
            vision_config_path
        )
    else:
        self.refer_vision_processor = None

    # Lists (e.g. from parsed config files) are treated like tuples rather
    # than being wrapped as ([w, h], [w, h]).
    if image_size is None:
        self.image_size = None
    elif isinstance(image_size, (tuple, list)):
        self.image_size = tuple(image_size)
    else:
        self.image_size = (image_size, image_size)

    flip = RandomHorizontalFlip() if random_flip else Lambda(lambda x: x)
    if self.image_size is not None:
        # The transforms receive (image_size[1], image_size[0]).
        target_size = (self.image_size[1], self.image_size[0])
        self.vision_processor = Compose(
            [
                Resize(target_size),
                CenterCrop(target_size) if center_crop else RandomCrop(target_size),
                flip,
                ToTensor(),
                Normalize([0.5], [0.5]),
            ]
        )
    else:
        self.vision_processor = Compose(
            [
                flip,
                ToTensor(),
                Normalize([0.5], [0.5]),
            ]
        )

    if vae_config_path is not None:
        # Close the config file deterministically instead of leaking the handle.
        with open(vae_config_path) as vae_config_file:
            vae_config_dict = json.load(vae_config_file)
        # NOTE(review): if the config has no "block_out_channels" key this
        # evaluates to 2 ** -1 == 0.5 — confirm the expected VAE config schema.
        vae_scale_factor = 2 ** (
            len(vae_config_dict.get("block_out_channels", [])) - 1
        )
        self.vae_image_processor = VaeImageProcessor(
            vae_scale_factor=vae_scale_factor * 2
        )
    else:
        self.vae_image_processor = None

    self.prompt_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
    self.prompt_template_editing = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
    # Added to the requested max_seq_length by the *_inputs methods so the
    # fixed template text does not consume the caller's token budget.
    self.prompt_start_index = 34
    self.prompt_editing_start_index = 64

image_token `instance-attribute` ¤

image_token = (
    "<|image_pad|>"
    if not hasattr(tokenizer, "image_token")
    else image_token
)

video_token `instance-attribute` ¤

video_token = (
    "<|video_pad|>"
    if not hasattr(tokenizer, "video_token")
    else video_token
)

image_token_id `instance-attribute` ¤

image_token_id = (
    image_token_id
    if getattr(tokenizer, "image_token_id", None)
    else convert_tokens_to_ids(image_token)
)

video_token_id `instance-attribute` ¤

video_token_id = (
    video_token_id
    if getattr(tokenizer, "video_token_id", None)
    else convert_tokens_to_ids(video_token)
)

refer_vision_processor `instance-attribute` ¤

refer_vision_processor = from_json_file(vision_config_path)

image_size `instance-attribute` ¤

image_size = (
    image_size
    if isinstance(image_size, tuple)
    else (image_size, image_size)
)

vision_processor `instance-attribute` ¤

vision_processor = Compose(
    [
        Resize((image_size[1], image_size[0])),
        (
            CenterCrop((image_size[1], image_size[0]))
            if center_crop
            else RandomCrop((image_size[1], image_size[0]))
        ),
        (
            RandomHorizontalFlip()
            if random_flip
            else Lambda(lambda x: x)
        ),
        ToTensor(),
        Normalize([0.5], [0.5]),
    ]
)

vae_image_processor `instance-attribute` ¤

vae_image_processor = VaeImageProcessor(
    vae_scale_factor=vae_scale_factor * 2
)

prompt_template `instance-attribute` ¤

prompt_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"

prompt_template_editing `instance-attribute` ¤

prompt_template_editing = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"

prompt_start_index `instance-attribute` ¤

prompt_start_index = 34

prompt_editing_start_index `instance-attribute` ¤

prompt_editing_start_index = 64

processing_image ¤

processing_image(image: Union[Image, str])

Process a reference image with the reference vision processor (`refer_vision_processor`).

Parameters:

Name	Type	Description	Default
`image`	`(Image, str)`	Input image or path.	required

Returns:

Name	Type	Description
`dict`		Vision processor outputs including pixel_values and image_grid_thw.

Source code in src/unitorch/models/diffusers/processing_qwen_image.py

def processing_image(
    self,
    image: Union[Image.Image, str],
):
    """
    Process a reference image with the reference vision processor.

    Args:
        image (Image.Image, str): Input image or path. A string is opened
            with ``Image.open``.

    Returns:
        dict: Vision processor outputs including pixel_values and image_grid_thw.

    Raises:
        ValueError: If ``refer_vision_processor`` is None, i.e. no vision
            config was available when the processor was built.
    """
    if self.refer_vision_processor is None:
        # Fail with an actionable message instead of "NoneType is not callable".
        raise ValueError(
            "refer_vision_processor is not configured; "
            "pass vision_config_path when creating the processor."
        )
    if isinstance(image, str):
        image = Image.open(image)
    return self.refer_vision_processor(images=[image], return_tensors="pt")

text2image_inputs ¤

text2image_inputs(
    prompt: str,
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/diffusers/processing_qwen_image.py

def text2image_inputs(
    self,
    prompt: str,
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
):
    """
    Tokenize a prompt (and optionally a negative prompt) for text-to-image.

    Each text is inserted into ``self.prompt_template``, tokenized,
    truncated, and right-padded with ``self.pad_token_id`` to a fixed
    length of ``max_seq_length + self.prompt_start_index``.

    Args:
        prompt (str): The prompt text.
        negative_prompt (str, optional): The negative prompt text. Pass None
            to skip it. Defaults to "".
        max_seq_length (int, optional): Overrides ``self.max_seq_length``
            when given.

    Returns:
        GenericOutputs: ``input_ids`` and ``attention_mask`` as long
        tensors, plus ``negative_input_ids`` and ``negative_attention_mask``
        (None when ``negative_prompt`` is None).
    """
    total_length = (
        pop_value(
            max_seq_length,
            self.max_seq_length,
        )
        + self.prompt_start_index
    )

    def _encode(text):
        # Template -> tokens -> ids, truncated and right-padded to total_length.
        pieces = self.tokenizer.tokenize(self.prompt_template.format(str(text)))
        ids = self.tokenizer.convert_tokens_to_ids(pieces[:total_length])
        pad_count = total_length - len(ids)
        mask = [1] * len(ids) + [0] * pad_count
        return ids + [self.pad_token_id] * pad_count, mask

    input_ids, attention_mask = _encode(prompt)

    negative_input_ids, negative_attention_mask = None, None
    if negative_prompt is not None:
        negative_input_ids, negative_attention_mask = _encode(negative_prompt)

    assert len(input_ids) == total_length
    assert len(attention_mask) == total_length

    def _as_long(values):
        return None if values is None else torch.tensor(values, dtype=torch.long)

    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        negative_input_ids=_as_long(negative_input_ids),
        negative_attention_mask=_as_long(negative_attention_mask),
    )

text2image ¤

text2image(
    prompt: str,
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/diffusers/processing_qwen_image.py

def text2image(
    self,
    prompt: str,
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """
    Build training inputs for text-to-image: prompt ids plus target pixels.

    Args:
        prompt (str): The prompt text.
        image (Image.Image, str): Target image or path. A string is opened
            with ``Image.open`` before the transforms are applied.
        max_seq_length (int, optional): Forwarded to ``text2image_inputs``.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask`` and ``pixel_values``.
    """
    prompt_outputs = self.text2image_inputs(
        prompt=prompt,
        negative_prompt=None,
        max_seq_length=max_seq_length,
    )
    # The signature accepts a path, but the transform pipeline needs an image
    # object, so open it here (same handling as processing_image).
    if isinstance(image, str):
        image = Image.open(image)
    pixel_values = self.vision_processor(image)

    return GenericOutputs(
        input_ids=prompt_outputs.input_ids,
        attention_mask=prompt_outputs.attention_mask,
        pixel_values=pixel_values,
    )

editing_inputs ¤

editing_inputs(
    prompt: str,
    refer_image: Union[Image, str],
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/diffusers/processing_qwen_image.py

def editing_inputs(
    self,
    prompt: str,
    refer_image: Union[Image.Image, str],
    negative_prompt: Optional[str] = "",
    max_seq_length: Optional[int] = None,
):
    """
    Tokenize an editing prompt and preprocess its reference image.

    Each text is inserted into ``self.prompt_template_editing``. Every image
    token in the result is expanded to ``image_grid_thw[i].prod() //
    merge_size**2`` copies before tokenization, then the ids are truncated
    and right-padded to ``max_seq_length + self.prompt_editing_start_index``.

    Args:
        prompt (str): The editing instruction.
        refer_image (Image.Image, str): Reference image or path. A string is
            opened once and reused for both the vision processor and the
            VAE image processor.
        negative_prompt (str, optional): The negative prompt text. Pass None
            to skip it. Defaults to "".
        max_seq_length (int, optional): Overrides ``self.max_seq_length``
            when given.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask``, the optional
        negative counterparts, ``refer_image_grid_thw``,
        ``refer_pixel_values`` and ``refer_vae_pixel_values``.
    """
    # Open a path up front: processing_image would open it only locally, and
    # the VAE preprocessing at the end would otherwise receive the raw string.
    if isinstance(refer_image, str):
        refer_image = Image.open(refer_image)

    total_length = (
        pop_value(
            max_seq_length,
            self.max_seq_length,
        )
        + self.prompt_editing_start_index
    )
    image_inputs = self.processing_image(refer_image)
    image_grid_thw = image_inputs["image_grid_thw"]
    image_merge_size = self.refer_vision_processor.merge_size**2

    def _encode(text):
        # Template -> expand image tokens -> ids, truncated and right-padded.
        text = self.prompt_template_editing.format(str(text))
        image_index = 0
        while self.image_token in text:
            num_image_tokens = (
                int(image_grid_thw[image_index].prod()) // image_merge_size
            )
            # Use a temporary placeholder so that already-expanded tokens do
            # not match the loop condition again.
            text = text.replace(
                self.image_token,
                "<|placeholder|>" * num_image_tokens,
                1,
            )
            image_index += 1
        text = text.replace("<|placeholder|>", self.image_token)

        pieces = self.tokenizer.tokenize(text)[:total_length]
        ids = self.tokenizer.convert_tokens_to_ids(pieces)
        pad_count = total_length - len(ids)
        mask = [1] * len(ids) + [0] * pad_count
        return ids + [self.pad_token_id] * pad_count, mask

    input_ids, attention_mask = _encode(prompt)

    if negative_prompt is not None:
        negative_input_ids, negative_attention_mask = _encode(negative_prompt)
    else:
        negative_input_ids = None
        negative_attention_mask = None

    refer_vae_pixel_values = self.vae_image_processor.preprocess(refer_image)[0]

    assert len(input_ids) == total_length
    assert len(attention_mask) == total_length
    return GenericOutputs(
        input_ids=torch.tensor(input_ids, dtype=torch.long),
        attention_mask=torch.tensor(attention_mask, dtype=torch.long),
        negative_input_ids=(
            torch.tensor(negative_input_ids, dtype=torch.long)
            if negative_input_ids is not None
            else None
        ),
        negative_attention_mask=(
            torch.tensor(negative_attention_mask, dtype=torch.long)
            if negative_attention_mask is not None
            else None
        ),
        refer_image_grid_thw=torch.tensor(
            image_inputs["image_grid_thw"], dtype=torch.long
        ),
        refer_pixel_values=torch.tensor(image_inputs["pixel_values"]),
        refer_vae_pixel_values=torch.tensor(refer_vae_pixel_values),
    )

editing ¤

editing(
    prompt: str,
    refer_image: Union[Image, str],
    image: Union[Image, str],
    max_seq_length: Optional[int] = None,
)

Source code in src/unitorch/models/diffusers/processing_qwen_image.py

def editing(
    self,
    prompt: str,
    refer_image: Union[Image.Image, str],
    image: Union[Image.Image, str],
    max_seq_length: Optional[int] = None,
):
    """
    Build training inputs for image editing.

    Args:
        prompt (str): The editing instruction.
        refer_image (Image.Image, str): Reference image or path, forwarded
            to ``editing_inputs``.
        image (Image.Image, str): Target image or path. A string is opened
            with ``Image.open`` before the transforms are applied.
        max_seq_length (int, optional): Forwarded to ``editing_inputs``.

    Returns:
        GenericOutputs: ``input_ids``, ``attention_mask``,
        ``refer_image_grid_thw``, ``refer_pixel_values``,
        ``refer_vae_pixel_values`` and ``pixel_values``.
    """
    prompt_outputs = self.editing_inputs(
        prompt=prompt,
        refer_image=refer_image,
        negative_prompt=None,
        max_seq_length=max_seq_length,
    )
    # The signature accepts a path, but the transform pipeline needs an image
    # object, so open it here (same handling as processing_image).
    if isinstance(image, str):
        image = Image.open(image)
    pixel_values = self.vision_processor(image)

    return GenericOutputs(
        input_ids=prompt_outputs.input_ids,
        attention_mask=prompt_outputs.attention_mask,
        refer_image_grid_thw=prompt_outputs.refer_image_grid_thw,
        refer_pixel_values=prompt_outputs.refer_pixel_values,
        refer_vae_pixel_values=prompt_outputs.refer_vae_pixel_values,
        pixel_values=pixel_values,
    )

QWenImageText2ImageGeneration¤

Bases: GenericQWenImageModel

Source code in src/unitorch/models/diffusers/modeling_qwen_image.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    num_train_timesteps: int = 1000,
    num_infer_timesteps: int = 50,
    freeze_vae_encoder: bool = True,
    freeze_text_encoder: bool = True,
    snr_gamma: float = 5.0,
    seed: int = 1123,
    gradient_checkpointing: bool = True,
    guidance_scale: float = 1.0,
    prompt_start_index: int = 34,
) -> None:
    """
    Set up the QWen-Image text-to-image model and its inference pipeline.

    All arguments except the last three are forwarded unchanged to the
    base-class constructor.

    Args:
        gradient_checkpointing (bool): When truthy, gradient checkpointing
            is enabled on ``self.transformer``. Defaults to True.
        guidance_scale (float): Stored as ``self.guidance_scale``.
            Defaults to 1.0.
        prompt_start_index (int): Stored as ``self.prompt_start_index`` and
            passed to ``_encode_prompt`` in ``forward``. Defaults to 34.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    if gradient_checkpointing:
        self.transformer.enable_gradient_checkpointing()

    # The pipeline reuses the already-built sub-modules; no tokenizer is
    # attached.
    self.pipeline = QwenImagePipeline(
        vae=self.vae,
        text_encoder=self.text,
        transformer=self.transformer,
        scheduler=self.scheduler,
        tokenizer=None,
    )
    self.pipeline.set_progress_bar_config(disable=True)

    self.guidance_scale = guidance_scale
    self.prompt_start_index = prompt_start_index

pipeline `instance-attribute` ¤

pipeline = QwenImagePipeline(
    vae=vae,
    text_encoder=text,
    transformer=transformer,
    scheduler=scheduler,
    tokenizer=None,
)

guidance_scale `instance-attribute` ¤

guidance_scale = guidance_scale

prompt_start_index `instance-attribute` ¤

prompt_start_index = prompt_start_index

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
) -> Tensor

Source code in src/unitorch/models/diffusers/modeling_qwen_image.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Run one training step and return the result of ``_compute_flow_loss``.

    The target image is encoded with the VAE, blended with Gaussian noise as
    ``(1 - sigma) * latents + sigma * noise``, packed, and passed through the
    transformer together with the encoded prompt.

    Args:
        input_ids: Prompt token ids handed to ``_encode_prompt``.
        pixel_values: Target images; a 4-D tensor gets an extra axis
            inserted at dim 2 before VAE encoding.
        attention_mask: Optional mask handed to ``_encode_prompt``.

    Returns:
        Whatever ``self._compute_flow_loss`` returns for the prediction,
        the noise, the clean latents and the sigmas.
    """
    text_states, text_mask = self._encode_prompt(
        input_ids, attention_mask, self.prompt_start_index
    )

    if pixel_values.ndim == 4:
        pixel_values = pixel_values.unsqueeze(2)

    posterior = self.vae.encode(pixel_values).latent_dist
    latents = self._normalize_latents(posterior.sample(), pixel_values.device)

    # Random draws happen in this order: posterior sample, noise, timesteps.
    noise = torch.randn_like(latents)
    timesteps, sigmas = self._sample_timesteps_and_sigmas(latents)
    noisy = (1.0 - sigmas) * latents + sigmas * noise

    batch, channels, lat_h, lat_w = latents.shape[:4]
    packed = _pack_latents(noisy, batch, channels, lat_h, lat_w)

    scale = 2 ** len(self.vae.temperal_downsample)
    height, width = pixel_values.shape[-2:]
    grid = (1, height // scale // 2, width // scale // 2)
    img_shapes = [[grid]] * batch

    text_lengths = text_mask.sum(dim=1).tolist()
    outputs = self.transformer(
        hidden_states=packed,
        timestep=timesteps / 1000,
        guidance=self._make_guidance(batch),
        encoder_hidden_states=text_states,
        encoder_hidden_states_mask=text_mask,
        img_shapes=img_shapes,
        txt_seq_lens=text_lengths,
        return_dict=False,
    )
    pred = _unpack_latents(outputs[0], lat_h * scale, lat_w * scale, scale)
    return self._compute_flow_loss(pred, noise, latents, sigmas)

generate ¤

generate(
    input_ids: Tensor,
    negative_input_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    height: int = 1024,
    width: int = 1024,
    guidance_scale: float = 1.0,
    true_cfg_scale: float = 4.0,
) -> GenericOutputs

Source code in src/unitorch/models/diffusers/modeling_qwen_image.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    height: int = 1024,
    width: int = 1024,
    guidance_scale: float = 1.0,
    true_cfg_scale: float = 4.0,
) -> GenericOutputs:
    """Generate images from positive and negative prompt token ids.

    The prompts are encoded through ``get_prompt_outputs`` and the resulting
    embeddings and masks are passed to ``self.pipeline``. The generator is
    re-seeded with ``self.seed`` on every call.

    Args:
        input_ids: Token ids of the positive prompt.
        negative_input_ids: Token ids of the negative prompt.
        attention_mask: Optional mask for ``input_ids``.
        negative_attention_mask: Optional mask for ``negative_input_ids``.
        height: Forwarded to the pipeline as ``height``.
        width: Forwarded to the pipeline as ``width``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``; the
            value stored on the instance in ``__init__`` is not read here.
        true_cfg_scale: Forwarded to the pipeline as ``true_cfg_scale``.

    Returns:
        ``GenericOutputs`` whose ``images`` field is the pipeline's NumPy
        output converted with ``torch.from_numpy``.
    """
    outputs = self.get_prompt_outputs(
        input_ids=input_ids,
        negative_input_ids=negative_input_ids,
        attention_mask=attention_mask,
        negative_attention_mask=negative_attention_mask,
        prompt_start_index=self.prompt_start_index,
    )
    generator = torch.Generator(device=self.pipeline.device).manual_seed(self.seed)
    images = self.pipeline(
        prompt_embeds=outputs.prompt_embeds,
        negative_prompt_embeds=outputs.negative_prompt_embeds,
        prompt_embeds_mask=outputs.prompt_embeds_mask,
        negative_prompt_embeds_mask=outputs.negative_prompt_embeds_mask,
        generator=generator,
        num_inference_steps=self.num_infer_timesteps,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        true_cfg_scale=true_cfg_scale,
        # "np" is the supported spelling; "np.array" only worked through a
        # deprecated fallback that warns and coerces the value to "np".
        output_type="np",
    ).images
    return GenericOutputs(images=torch.from_numpy(images))

QWenImageEditingGeneration¤

Bases: GenericQWenImageModel

Source code in src/unitorch/models/diffusers/modeling_qwen_image.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    num_train_timesteps: int = 1000,
    num_infer_timesteps: int = 50,
    freeze_vae_encoder: bool = True,
    freeze_text_encoder: bool = True,
    snr_gamma: float = 5.0,
    seed: int = 1123,
    gradient_checkpointing: bool = True,
    guidance_scale: float = 1.0,
    prompt_start_index: int = 64,
) -> None:
    """Set up the image-editing model and the pipeline used for inference.

    Args:
        config_path: Forwarded unchanged to the base-class constructor.
        text_config_path: Forwarded unchanged to the base-class constructor.
        vae_config_path: Forwarded unchanged to the base-class constructor.
        scheduler_config_path: Forwarded unchanged to the base-class
            constructor.
        num_train_timesteps: Forwarded unchanged to the base-class
            constructor.
        num_infer_timesteps: Forwarded unchanged to the base-class
            constructor.
        freeze_vae_encoder: Forwarded unchanged to the base-class
            constructor.
        freeze_text_encoder: Forwarded unchanged to the base-class
            constructor.
        snr_gamma: Forwarded unchanged to the base-class constructor.
        seed: Forwarded unchanged to the base-class constructor.
        gradient_checkpointing: When true, calls
            ``enable_gradient_checkpointing()`` on ``self.transformer``.
        guidance_scale: Stored on the instance as ``self.guidance_scale``.
        prompt_start_index: Stored on the instance as
            ``self.prompt_start_index``.
    """
    # Everything the base class needs, gathered in one place.
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    if gradient_checkpointing:
        self.transformer.enable_gradient_checkpointing()

    # The edit pipeline reuses the modules created by the base class and is
    # built without a tokenizer or processor.
    pipeline = QwenImageEditPipeline(
        vae=self.vae,
        text_encoder=self.text,
        transformer=self.transformer,
        scheduler=self.scheduler,
        tokenizer=None,
        processor=None,
    )
    pipeline.set_progress_bar_config(disable=True)
    self.pipeline = pipeline

    self.guidance_scale = guidance_scale
    self.prompt_start_index = prompt_start_index

pipeline `instance-attribute` ¤

pipeline = QwenImageEditPipeline(
    vae=vae,
    text_encoder=text,
    transformer=transformer,
    scheduler=scheduler,
    tokenizer=None,
    processor=None,
)

guidance_scale `instance-attribute` ¤

guidance_scale = guidance_scale

prompt_start_index `instance-attribute` ¤

prompt_start_index = prompt_start_index

forward ¤

forward(
    input_ids: Tensor,
    pixel_values: Tensor,
    refer_pixel_values: Tensor,
    refer_image_grid_thw: Tensor,
    refer_vae_pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
) -> Tensor

Source code in src/unitorch/models/diffusers/modeling_qwen_image.py

def forward(
    self,
    input_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    refer_pixel_values: torch.Tensor,
    refer_image_grid_thw: torch.Tensor,
    refer_vae_pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Run one editing training step and return ``_compute_flow_loss``'s result.

    The target image is encoded with the VAE and blended with Gaussian noise
    as ``(1 - sigma) * latents + sigma * noise``. The packed noisy target is
    concatenated with the packed reference latents along dim 1, and only the
    leading target part of the transformer output enters the loss.

    Args:
        input_ids: Prompt token ids handed to ``_encode_prompt``.
        pixel_values: Target images; a 4-D tensor gets an extra axis
            inserted at dim 2 before VAE encoding.
        refer_pixel_values: Passed to ``_encode_prompt`` as ``pixel_values``.
        refer_image_grid_thw: Passed to ``_encode_prompt`` as
            ``image_grid_thw``.
        refer_vae_pixel_values: Reference images encoded with the VAE (the
            mode of the latent distribution is used); a 4-D tensor gets an
            extra axis inserted at dim 2 first.
        attention_mask: Optional mask handed to ``_encode_prompt``.

    Returns:
        Whatever ``self._compute_flow_loss`` returns for the prediction,
        the noise, the clean latents and the sigmas.
    """
    text_states, text_mask = self._encode_prompt(
        input_ids,
        attention_mask,
        self.prompt_start_index,
        pixel_values=refer_pixel_values,
        image_grid_thw=refer_image_grid_thw,
    )

    if pixel_values.ndim == 4:
        pixel_values = pixel_values.unsqueeze(2)
    if refer_vae_pixel_values.ndim == 4:
        refer_vae_pixel_values = refer_vae_pixel_values.unsqueeze(2)

    # Both latent sets are normalized against the target image's device.
    device = pixel_values.device
    target_posterior = self.vae.encode(pixel_values).latent_dist
    latents = self._normalize_latents(target_posterior.sample(), device)
    refer_posterior = self.vae.encode(refer_vae_pixel_values).latent_dist
    refer_latents = self._normalize_latents(refer_posterior.mode(), device)

    # Random draws happen in this order: posterior sample, noise, timesteps.
    noise = torch.randn_like(latents)
    timesteps, sigmas = self._sample_timesteps_and_sigmas(latents)
    noisy = (1.0 - sigmas) * latents + sigmas * noise

    batch, channels, lat_h, lat_w = latents.shape[:4]
    target_tokens = _pack_latents(noisy, batch, channels, lat_h, lat_w)
    ref_b, ref_c, ref_h, ref_w = refer_latents.shape
    refer_tokens = _pack_latents(refer_latents, ref_b, ref_c, ref_h, ref_w)
    model_input = torch.cat([target_tokens, refer_tokens], dim=1)

    scale = 2 ** len(self.vae.temperal_downsample)
    height, width = pixel_values.shape[-2:]
    refer_height, refer_width = refer_vae_pixel_values.shape[-2:]
    target_grid = (1, height // scale // 2, width // scale // 2)
    refer_grid = (1, refer_height // scale // 2, refer_width // scale // 2)
    img_shapes = [[target_grid, refer_grid]] * batch

    text_lengths = text_mask.sum(dim=1).tolist()
    outputs = self.transformer(
        hidden_states=model_input,
        timestep=timesteps / 1000,
        guidance=self._make_guidance(batch),
        encoder_hidden_states=text_states,
        encoder_hidden_states_mask=text_mask,
        img_shapes=img_shapes,
        txt_seq_lens=text_lengths,
        return_dict=False,
    )
    # Drop the positions that belong to the reference tokens.
    n_target = target_tokens.shape[1]
    pred = outputs[0][:, :n_target]

    pred = _unpack_latents(pred, lat_h * scale, lat_w * scale, scale)
    return self._compute_flow_loss(pred, noise, latents, sigmas)

generate ¤

generate(
    input_ids: Tensor,
    negative_input_ids: Tensor,
    refer_pixel_values: Tensor,
    refer_image_grid_thw: Tensor,
    refer_vae_pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    height: int = 1024,
    width: int = 1024,
    guidance_scale: float = 1.0,
    true_cfg_scale: float = 4.0,
) -> GenericOutputs

Source code in src/unitorch/models/diffusers/modeling_qwen_image.py

@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    refer_pixel_values: torch.Tensor,
    refer_image_grid_thw: torch.Tensor,
    refer_vae_pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    height: int = 1024,
    width: int = 1024,
    guidance_scale: float = 1.0,
    true_cfg_scale: float = 4.0,
) -> GenericOutputs:
    """Generate edited images from prompts and a reference image.

    The prompts and the reference image inputs are encoded through
    ``get_prompt_outputs``; the resulting embeddings and masks are passed to
    ``self.pipeline`` together with ``refer_vae_pixel_values`` as ``image``.
    The generator is re-seeded with ``self.seed`` on every call.

    Args:
        input_ids: Token ids of the positive prompt.
        negative_input_ids: Token ids of the negative prompt.
        refer_pixel_values: Passed to ``get_prompt_outputs`` as
            ``pixel_values``.
        refer_image_grid_thw: Passed to ``get_prompt_outputs`` as
            ``image_grid_thw``.
        refer_vae_pixel_values: Passed to the pipeline as ``image``.
        attention_mask: Optional mask for ``input_ids``.
        negative_attention_mask: Optional mask for ``negative_input_ids``.
        height: Forwarded to the pipeline as ``height``.
        width: Forwarded to the pipeline as ``width``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``; the
            value stored on the instance in ``__init__`` is not read here.
        true_cfg_scale: Forwarded to the pipeline as ``true_cfg_scale``.

    Returns:
        ``GenericOutputs`` whose ``images`` field is the pipeline's NumPy
        output converted with ``torch.from_numpy``.
    """
    outputs = self.get_prompt_outputs(
        input_ids=input_ids,
        negative_input_ids=negative_input_ids,
        pixel_values=refer_pixel_values,
        image_grid_thw=refer_image_grid_thw,
        attention_mask=attention_mask,
        negative_attention_mask=negative_attention_mask,
        prompt_start_index=self.prompt_start_index,
    )
    generator = torch.Generator(device=self.pipeline.device).manual_seed(self.seed)
    images = self.pipeline(
        image=refer_vae_pixel_values,
        prompt_embeds=outputs.prompt_embeds,
        negative_prompt_embeds=outputs.negative_prompt_embeds,
        prompt_embeds_mask=outputs.prompt_embeds_mask,
        negative_prompt_embeds_mask=outputs.negative_prompt_embeds_mask,
        generator=generator,
        num_inference_steps=self.num_infer_timesteps,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        true_cfg_scale=true_cfg_scale,
        # "np" is the supported spelling; "np.array" only worked through a
        # deprecated fallback that warns and coerces the value to "np".
        output_type="np",
    ).images
    return GenericOutputs(images=torch.from_numpy(images))

unitorch.models.diffusers¤

WanForText2VideoGeneration¤

pipeline instance-attribute ¤

forward ¤

generate ¤

WanForImage2VideoGeneration¤

pipeline instance-attribute ¤

forward ¤

generate ¤

QWenImageProcessor¤

image_token instance-attribute ¤

video_token instance-attribute ¤

image_token_id instance-attribute ¤

video_token_id instance-attribute ¤

refer_vision_processor instance-attribute ¤

image_size instance-attribute ¤

vision_processor instance-attribute ¤

vae_image_processor instance-attribute ¤

prompt_template instance-attribute ¤

prompt_template_editing instance-attribute ¤

prompt_start_index instance-attribute ¤

prompt_editing_start_index instance-attribute ¤

processing_image ¤

text2image_inputs ¤

text2image ¤

editing_inputs ¤

editing ¤

QWenImageText2ImageGeneration¤

pipeline instance-attribute ¤

guidance_scale instance-attribute ¤

prompt_start_index instance-attribute ¤

forward ¤

generate ¤

QWenImageEditingGeneration¤

pipeline instance-attribute ¤

guidance_scale instance-attribute ¤

prompt_start_index instance-attribute ¤

forward ¤

generate ¤

pipeline `instance-attribute` ¤

pipeline `instance-attribute` ¤

image_token `instance-attribute` ¤

video_token `instance-attribute` ¤

image_token_id `instance-attribute` ¤

video_token_id `instance-attribute` ¤

refer_vision_processor `instance-attribute` ¤

image_size `instance-attribute` ¤

vision_processor `instance-attribute` ¤

vae_image_processor `instance-attribute` ¤

prompt_template `instance-attribute` ¤

prompt_template_editing `instance-attribute` ¤

prompt_start_index `instance-attribute` ¤

prompt_editing_start_index `instance-attribute` ¤

pipeline `instance-attribute` ¤

guidance_scale `instance-attribute` ¤

prompt_start_index `instance-attribute` ¤

pipeline `instance-attribute` ¤

guidance_scale `instance-attribute` ¤

prompt_start_index `instance-attribute` ¤