unitorch.models.diffusers¤

StableProcessor¤

Bases: HfTextClassificationProcessor

Source code in src/unitorch/models/diffusers/processing_stable.py

def __init__(
    self,
    vocab_path: str,
    merge_path: str,
    vae_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 77,
    position_start_id: Optional[int] = 0,
    pad_token: Optional[str] = "<|endoftext|>",
    image_size: Optional[Tuple[int, int]] = None,
    center_crop: Optional[bool] = False,
    random_flip: Optional[bool] = False,
):
    """
    Build the CLIP tokenizer, the image transforms and the optional VAE image processors.

    Args:
        vocab_path (str): Vocabulary file passed to `CLIPTokenizer`.
        merge_path (str): Merges file passed to `CLIPTokenizer`.
        vae_config_path (Optional[str]): JSON config of the VAE. When given, the length of
            its `block_out_channels` entry determines the scale factor of the two
            `VaeImageProcessor` instances; when None both are set to None.
        max_seq_length (Optional[int]): Forwarded to `HfTextClassificationProcessor`.
        position_start_id (Optional[int]): Forwarded to `HfTextClassificationProcessor`.
        pad_token (Optional[str]): Token assigned as the tokenizer's pad token.
        image_size (Optional[Tuple[int, int]]): Target size. A non-tuple value is expanded to
            `(image_size, image_size)`. The transforms receive `(image_size[1], image_size[0])`,
            so this is presumably `(width, height)` -- TODO confirm.
        center_crop (Optional[bool]): Use `CenterCrop` instead of `RandomCrop` (only when
            `image_size` is set).
        random_flip (Optional[bool]): Add a `RandomHorizontalFlip` step to the image transform.
    """
    tokenizer = CLIPTokenizer(
        vocab_file=vocab_path,
        merges_file=merge_path,
    )

    # Reuse the BOS/EOS tokens as CLS/SEP for the text processor.
    tokenizer.cls_token = tokenizer.bos_token
    tokenizer.sep_token = tokenizer.eos_token
    tokenizer.pad_token = pad_token

    HfTextClassificationProcessor.__init__(
        self,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        position_start_id=position_start_id,
    )

    if image_size is not None:
        self.image_size = (
            image_size
            if isinstance(image_size, tuple)
            else (image_size, image_size)
        )
    else:
        self.image_size = None
    if self.image_size is not None:
        self.vision_processor = Compose(
            [
                Resize((self.image_size[1], self.image_size[0])),
                CenterCrop((self.image_size[1], self.image_size[0]))
                if center_crop
                else RandomCrop((self.image_size[1], self.image_size[0])),
                RandomHorizontalFlip() if random_flip else Lambda(lambda x: x),
                ToTensor(),
                Normalize([0.5], [0.5]),
            ]
        )
        # Condition images are resized and center-cropped but not normalized.
        self.condition_vision_processor = Compose(
            [
                Resize((self.image_size[1], self.image_size[0])),
                CenterCrop((self.image_size[1], self.image_size[0])),
                ToTensor(),
            ]
        )
    else:
        self.vision_processor = Compose(
            [
                RandomHorizontalFlip() if random_flip else Lambda(lambda x: x),
                ToTensor(),
                Normalize([0.5], [0.5]),
            ]
        )
        self.condition_vision_processor = Compose(
            [
                ToTensor(),
            ]
        )

    if vae_config_path is not None:
        # Use a context manager so the config file handle is closed deterministically.
        with open(vae_config_path) as vae_config_file:
            vae_config_dict = json.load(vae_config_file)
        vae_scale_factor = 2 ** (
            len(vae_config_dict.get("block_out_channels", [])) - 1
        )
        self.vae_image_processor = VaeImageProcessor(
            vae_scale_factor=vae_scale_factor
        )
        self.vae_condition_image_processor = VaeImageProcessor(
            vae_scale_factor=vae_scale_factor,
            do_convert_rgb=True,
            do_normalize=False,
        )
    else:
        self.vae_image_processor = None
        self.vae_condition_image_processor = None

StableForText2ImageGeneration¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_stable.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.unet` and
    `self.scheduler`, with no tokenizer, safety checker or feature extractor,
    and its progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionPipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

StableForImage2ImageGeneration¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_stable.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionImg2ImgPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.unet` and
    `self.scheduler`, with no tokenizer, safety checker or feature extractor,
    and its progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionImg2ImgPipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

StableForImageInpainting¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_stable.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionInpaintPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.unet` and
    `self.scheduler`, with no tokenizer, safety checker or feature extractor,
    and its progress bar is disabled. Finally the UNet's configured
    `in_channels` is stored as `self.num_channels_unet`.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionInpaintPipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

    unet_config = self.unet.config
    self.num_channels_unet = unet_config.in_channels

StableForImageResolution¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_stable.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionUpscalePipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.unet` and
    `self.scheduler`; the same scheduler object is supplied both as `scheduler`
    and as `low_res_scheduler`. No tokenizer, safety checker or feature extractor
    is attached, and the pipeline's progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionUpscalePipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        scheduler=self.scheduler,
        low_res_scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

StableXLProcessor¤

Source code in src/unitorch/models/diffusers/processing_stable_xl.py

def __init__(
    self,
    vocab_path: str,
    merge_path: str,
    vocab2_path: str,
    merge2_path: str,
    vae_config_path: Optional[str] = None,
    max_seq_length: Optional[int] = 77,
    position_start_id: Optional[int] = 0,
    pad_token: Optional[str] = "<|endoftext|>",
    pad_token2: Optional[str] = "!",
    image_size: Optional[Tuple[int, int]] = None,
    center_crop: Optional[bool] = False,
    random_flip: Optional[bool] = False,
):
    """
    Build two CLIP text processors, the image transforms and the optional VAE image processors.

    Args:
        vocab_path (str): Vocabulary file for the first `CLIPTokenizer`.
        merge_path (str): Merges file for the first `CLIPTokenizer`.
        vocab2_path (str): Vocabulary file for the second `CLIPTokenizer`.
        merge2_path (str): Merges file for the second `CLIPTokenizer`.
        vae_config_path (Optional[str]): JSON config of the VAE. When given, the length of
            its `block_out_channels` entry determines the scale factor of the two
            `VaeImageProcessor` instances; when None both are set to None.
        max_seq_length (Optional[int]): Forwarded to both `HfTextClassificationProcessor`s.
        position_start_id (Optional[int]): Forwarded to both `HfTextClassificationProcessor`s.
        pad_token (Optional[str]): Pad token assigned to the first tokenizer.
        pad_token2 (Optional[str]): Pad token assigned to the second tokenizer.
        image_size (Optional[Tuple[int, int]]): Target size. A non-tuple value is expanded to
            `(image_size, image_size)`. The transforms receive `(image_size[1], image_size[0])`,
            so this is presumably `(width, height)` -- TODO confirm.
        center_crop (Optional[bool]): Use `CenterCrop` instead of `RandomCrop` (only when
            `image_size` is set).
        random_flip (Optional[bool]): Add a `RandomHorizontalFlip` step to the image transform.
    """
    tokenizer1 = CLIPTokenizer(
        vocab_file=vocab_path,
        merges_file=merge_path,
    )

    # Reuse the BOS/EOS tokens as CLS/SEP for the text processor.
    tokenizer1.cls_token = tokenizer1.bos_token
    tokenizer1.sep_token = tokenizer1.eos_token
    tokenizer1.pad_token = pad_token

    self.text_processor1 = HfTextClassificationProcessor(
        tokenizer=tokenizer1,
        max_seq_length=max_seq_length,
        position_start_id=position_start_id,
    )

    tokenizer2 = CLIPTokenizer(
        vocab_file=vocab2_path,
        merges_file=merge2_path,
    )

    tokenizer2.cls_token = tokenizer2.bos_token
    tokenizer2.sep_token = tokenizer2.eos_token
    tokenizer2.pad_token = pad_token2

    self.text_processor2 = HfTextClassificationProcessor(
        tokenizer=tokenizer2,
        max_seq_length=max_seq_length,
        position_start_id=position_start_id,
    )

    if image_size is not None:
        self.image_size = (
            image_size
            if isinstance(image_size, tuple)
            else (image_size, image_size)
        )
    else:
        self.image_size = None

    if self.image_size is not None:
        self.vision_processor = Compose(
            [
                Resize((self.image_size[1], self.image_size[0])),
                CenterCrop((self.image_size[1], self.image_size[0]))
                if center_crop
                else RandomCrop((self.image_size[1], self.image_size[0])),
                RandomHorizontalFlip() if random_flip else Lambda(lambda x: x),
                ToTensor(),
                Normalize([0.5], [0.5]),
            ]
        )

        # Condition images are resized and center-cropped but not normalized.
        self.condition_vision_processor = Compose(
            [
                Resize((self.image_size[1], self.image_size[0])),
                CenterCrop((self.image_size[1], self.image_size[0])),
                ToTensor(),
            ]
        )
    else:
        self.vision_processor = Compose(
            [
                RandomHorizontalFlip() if random_flip else Lambda(lambda x: x),
                ToTensor(),
                Normalize([0.5], [0.5]),
            ]
        )

        self.condition_vision_processor = Compose(
            [
                ToTensor(),
            ]
        )

    if vae_config_path is not None:
        # Use a context manager so the config file handle is closed deterministically.
        with open(vae_config_path) as vae_config_file:
            vae_config_dict = json.load(vae_config_file)
        vae_scale_factor = 2 ** (
            len(vae_config_dict.get("block_out_channels", [])) - 1
        )
        self.vae_image_processor = VaeImageProcessor(
            vae_scale_factor=vae_scale_factor
        )
        self.vae_condition_image_processor = VaeImageProcessor(
            vae_scale_factor=vae_scale_factor,
            do_convert_rgb=True,
            do_normalize=False,
        )
    else:
        self.vae_image_processor = None
        self.vae_condition_image_processor = None

StableXLForText2ImageGeneration¤

Bases: GenericStableXLModel

Source code in src/unitorch/models/diffusers/modeling_stable_xl.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    text2_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionXLPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.text2`,
    `self.unet` and `self.scheduler`, with both tokenizers left as None, and its
    progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        text2_config_path=text2_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionXLPipeline(
        vae=self.vae,
        text_encoder=self.text,
        text_encoder_2=self.text2,
        unet=self.unet,
        scheduler=self.scheduler,
        tokenizer=None,
        tokenizer_2=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

StableXLForImage2ImageGeneration¤

Bases: GenericStableXLModel

Source code in src/unitorch/models/diffusers/modeling_stable_xl.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    text2_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionXLImg2ImgPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.text2`,
    `self.unet` and `self.scheduler`, with both tokenizers left as None, and its
    progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        text2_config_path=text2_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionXLImg2ImgPipeline(
        vae=self.vae,
        text_encoder=self.text,
        text_encoder_2=self.text2,
        unet=self.unet,
        scheduler=self.scheduler,
        tokenizer=None,
        tokenizer_2=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

StableXLForImageInpainting¤

Bases: GenericStableXLModel

Source code in src/unitorch/models/diffusers/modeling_stable_xl.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    text2_config_path: str,
    vae_config_path: str,
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionXLInpaintPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.text2`,
    `self.unet` and `self.scheduler`, with both tokenizers left as None, and its
    progress bar is disabled. Finally the UNet's configured `in_channels` is
    stored as `self.num_channels_unet`.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        text2_config_path=text2_config_path,
        vae_config_path=vae_config_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionXLInpaintPipeline(
        vae=self.vae,
        text_encoder=self.text,
        text_encoder_2=self.text2,
        unet=self.unet,
        scheduler=self.scheduler,
        tokenizer=None,
        tokenizer_2=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

    unet_config = self.unet.config
    self.num_channels_unet = unet_config.in_channels

ControlNetForText2ImageGeneration¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_controlnet.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    controlnet_configs_path: Union[str, List[str]],
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    freeze_unet_encoder: Optional[bool] = False,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionControlNetPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.unet`,
    `self.controlnet` and `self.scheduler`, with no tokenizer, safety checker or
    feature extractor, and its progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        controlnet_configs_path=controlnet_configs_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        freeze_unet_encoder=freeze_unet_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionControlNetPipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        controlnet=self.controlnet,
        scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

ControlNetForImage2ImageGeneration¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_controlnet.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    controlnet_configs_path: Union[str, List[str]],
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    freeze_unet_encoder: Optional[bool] = False,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionControlNetImg2ImgPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.unet`,
    `self.controlnet` and `self.scheduler`, with no tokenizer, safety checker or
    feature extractor, and its progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        controlnet_configs_path=controlnet_configs_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        freeze_unet_encoder=freeze_unet_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionControlNetImg2ImgPipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        controlnet=self.controlnet,
        scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

ControlNetForImageInpainting¤

Bases: GenericStableModel

Source code in src/unitorch/models/diffusers/modeling_controlnet.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    vae_config_path: str,
    controlnet_configs_path: Union[str, List[str]],
    scheduler_config_path: str,
    inpainting_controlnet_config_path: Optional[str] = None,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    freeze_unet_encoder: Optional[bool] = False,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionControlNetInpaintPipeline`.

    Every argument, including the optional `inpainting_controlnet_config_path`
    (None by default), is forwarded by keyword, unchanged, to the parent
    constructor. The pipeline is then built from `self.vae`, `self.text`,
    `self.unet`, `self.controlnet` and `self.scheduler`, with no tokenizer,
    safety checker or feature extractor, and its progress bar is disabled.
    """
    super().__init__(
        config_path=config_path,
        text_config_path=text_config_path,
        vae_config_path=vae_config_path,
        controlnet_configs_path=controlnet_configs_path,
        scheduler_config_path=scheduler_config_path,
        inpainting_controlnet_config_path=inpainting_controlnet_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        freeze_unet_encoder=freeze_unet_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    self.pipeline = StableDiffusionControlNetInpaintPipeline(
        vae=self.vae,
        text_encoder=self.text,
        unet=self.unet,
        controlnet=self.controlnet,
        scheduler=self.scheduler,
        tokenizer=None,
        safety_checker=None,
        feature_extractor=None,
    )
    self.pipeline.set_progress_bar_config(disable=True)

ControlNetXLForText2ImageGeneration¤

Bases: GenericStableXLModel

ControlNetXL model for text-to-image generation.

Parameters:

Name	Type	Description	Default
`config_path`	`str`	Path to the model configuration file.	required
`text_config_path`	`str`	Path to the text model configuration file.	required
`text2_config_path`	`str`	Path to the second text model configuration file.	required
`vae_config_path`	`str`	Path to the VAE model configuration file.	required
`controlnet_configs_path`	`Union[str, List[str]]`	Path, or list of paths, to the ControlNet model configuration file(s).	required
`scheduler_config_path`	`str`	Path to the scheduler configuration file.	required
`quant_config_path`	`Optional[str]`	Path to the quantization configuration file (default: None).	`None`
`image_size`	`Optional[int]`	Size of the input image (default: None).	`None`
`in_channels`	`Optional[int]`	Number of input channels (default: None).	`None`
`out_channels`	`Optional[int]`	Number of output channels (default: None).	`None`
`num_train_timesteps`	`Optional[int]`	Number of training timesteps (default: 1000).	`1000`
`num_infer_timesteps`	`Optional[int]`	Number of inference timesteps (default: 50).	`50`
`freeze_vae_encoder`	`Optional[bool]`	Whether to freeze the VAE encoder (default: True).	`True`
`freeze_text_encoder`	`Optional[bool]`	Whether to freeze the text encoder (default: True).	`True`
`freeze_unet_encoder`	`Optional[bool]`	Whether to freeze the UNet encoder (default: False).	`False`
`snr_gamma`	`Optional[float]`	SNR gamma value forwarded to the base model (default: 5.0).	`5.0`
`seed`	`Optional[int]`	Random seed (default: 1123).	`1123`

Source code in src/unitorch/models/diffusers/modeling_controlnet_xl.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    text2_config_path: str,
    vae_config_path: str,
    controlnet_configs_path: Union[str, List[str]],
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    freeze_unet_encoder: Optional[bool] = False,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialize the base model and wrap its components in a `StableDiffusionXLControlNetPipeline`.

    Every argument is forwarded by keyword, unchanged, to the parent constructor.
    The pipeline is then built from `self.vae`, `self.text`, `self.text2`,
    `self.unet`, `self.controlnet` and `self.scheduler`, with both tokenizers
    left as None, and its progress bar is disabled.
    """
    base_kwargs = dict(
        config_path=config_path,
        text_config_path=text_config_path,
        text2_config_path=text2_config_path,
        vae_config_path=vae_config_path,
        controlnet_configs_path=controlnet_configs_path,
        scheduler_config_path=scheduler_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        freeze_unet_encoder=freeze_unet_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    super().__init__(**base_kwargs)

    pipeline = StableDiffusionXLControlNetPipeline(
        vae=self.vae,
        text_encoder=self.text,
        text_encoder_2=self.text2,
        unet=self.unet,
        controlnet=self.controlnet,
        scheduler=self.scheduler,
        tokenizer=None,
        tokenizer_2=None,
    )
    self.pipeline = pipeline
    pipeline.set_progress_bar_config(disable=True)

forward ¤

forward(
    input_ids: Tensor,
    input2_ids: Tensor,
    add_time_ids: Tensor,
    pixel_values: Tensor,
    condition_pixel_values: Tensor,
    attention_mask: Optional[Tensor] = None,
    attention2_mask: Optional[Tensor] = None,
)

Forward pass of the model.

Parameters:

Name	Type	Description	Default
`input_ids`	`Tensor`	Input IDs.	required
`input2_ids`	`Tensor`	Second input IDs.	required
`add_time_ids`	`Tensor`	Additional time IDs.	required
`pixel_values`	`Tensor`	Pixel values.	required
`condition_pixel_values`	`Tensor`	Condition pixel values.	required
`attention_mask`	`Optional[Tensor]`	Attention mask (default: None).	`None`
`attention2_mask`	`Optional[Tensor]`	Second attention mask (default: None).	`None`

Returns:

Type	Description
	torch.Tensor: Loss value.

Source code in src/unitorch/models/diffusers/modeling_controlnet_xl.py

def forward(
    self,
    input_ids: torch.Tensor,
    input2_ids: torch.Tensor,
    add_time_ids: torch.Tensor,
    pixel_values: torch.Tensor,
    condition_pixel_values: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    attention2_mask: Optional[torch.Tensor] = None,
):
    """
    Compute the denoising training loss for one batch.

    The images are encoded to VAE latents, noised at random timesteps, and the
    UNet (with ControlNet residuals) predicts the target; the loss is the mean
    squared error between that prediction and the target.

    Args:
        input_ids (torch.Tensor): Token IDs for the first text encoder.
        input2_ids (torch.Tensor): Token IDs for the second text encoder.
        add_time_ids (torch.Tensor): Passed to ControlNet and UNet as `time_ids`
            inside `added_cond_kwargs`.
        pixel_values (torch.Tensor): Images encoded by the VAE.
        condition_pixel_values (torch.Tensor): Passed to ControlNet as `controlnet_cond`.
        attention_mask (Optional[torch.Tensor]): Accepted but currently unused.
        attention2_mask (Optional[torch.Tensor]): Accepted but currently unused.

    Returns:
        torch.Tensor: Mean-reduced MSE loss.
    """
    # NOTE: the attention masks are intentionally not passed to the text encoders.
    prompt_outputs = self.text(
        input_ids,
        output_hidden_states=True,
    )
    # Both encoders contribute their second-to-last hidden state.
    prompt_embeds = prompt_outputs.hidden_states[-2]
    prompt2_outputs = self.text2(
        input2_ids,
        output_hidden_states=True,
    )
    prompt2_embeds = prompt2_outputs.hidden_states[-2]
    prompt_embeds = torch.concat([prompt_embeds, prompt2_embeds], dim=-1)
    # The first output of the second encoder is used as the pooled text embedding.
    pooled_prompt_embeds = prompt2_outputs[0]

    latents = self.vae.encode(pixel_values).latent_dist.sample()
    latents = latents * self.vae.config.scaling_factor

    # Sample exactly as before (same RNG stream), then match the latents'
    # device *and* dtype so non-float32 latents are not mixed with float32 noise.
    noise = torch.randn(latents.shape).to(
        device=latents.device, dtype=latents.dtype
    )
    batch = latents.size(0)

    timesteps = torch.randint(
        0,
        self.scheduler.config.num_train_timesteps,
        (batch,),
        device=pixel_values.device,
    ).long()

    noise_latents = self.scheduler.add_noise(
        latents,
        noise,
        timesteps,
    )

    added_cond_kwargs = {
        "time_ids": add_time_ids,
        "text_embeds": pooled_prompt_embeds,
    }

    down_block_res_samples, mid_block_res_sample = self.controlnet(
        noise_latents,
        timesteps,
        encoder_hidden_states=prompt_embeds,
        controlnet_cond=condition_pixel_values,
        added_cond_kwargs=dict(added_cond_kwargs),
        return_dict=False,
    )
    outputs = self.unet(
        noise_latents,
        timesteps,
        prompt_embeds,
        added_cond_kwargs=dict(added_cond_kwargs),
        down_block_additional_residuals=down_block_res_samples,
        mid_block_additional_residual=mid_block_res_sample,
    ).sample

    # For v-prediction schedulers the regression target is the velocity, not the noise.
    if self.scheduler.config.prediction_type == "v_prediction":
        noise = self.scheduler.get_velocity(latents, noise, timesteps)

    loss = F.mse_loss(outputs, noise, reduction="mean")
    return loss

generate ¤

generate(
    condition_pixel_values: Tensor,
    input_ids: Tensor,
    input2_ids: Tensor,
    negative_input_ids: Tensor,
    negative_input2_ids: Tensor,
    attention_mask: Optional[Tensor] = None,
    attention2_mask: Optional[Tensor] = None,
    negative_attention_mask: Optional[Tensor] = None,
    negative_attention2_mask: Optional[Tensor] = None,
    height: Optional[int] = 1024,
    width: Optional[int] = 1024,
    guidance_scale: Optional[float] = 5.0,
    controlnet_conditioning_scale: Optional[
        Union[float, List[float]]
    ] = 1.0,
)

Generate images using the model.

Parameters:

Name	Type	Description	Default
`condition_pixel_values`	`Tensor`	Condition pixel values.	required
`input_ids`	`Tensor`	Input IDs.	required
`input2_ids`	`Tensor`	Second input IDs.	required
`negative_input_ids`	`Tensor`	Negative input IDs.	required
`negative_input2_ids`	`Tensor`	Negative second input IDs.	required
`attention_mask`	`Optional[Tensor]`	Attention mask (default: None).	`None`
`attention2_mask`	`Optional[Tensor]`	Second attention mask (default: None).	`None`
`negative_attention_mask`	`Optional[Tensor]`	Negative attention mask (default: None).	`None`
`negative_attention2_mask`	`Optional[Tensor]`	Negative second attention mask (default: None).	`None`
`height`	`Optional[int]`	Height of the generated images (default: 1024).	`1024`
`width`	`Optional[int]`	Width of the generated images (default: 1024).	`1024`
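`guidance_scale`	`Optional[float]`	Scale for guidance (default: 5.0).	`5.0`
`controlnet_conditioning_scale`	`Optional[Union[float, List[float]]]`	ControlNet conditioning scale (default: 1.0).	`1.0`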

Returns:

Name	Type	Description
`GenericOutputs`		Generated images.

Source code in src/unitorch/models/diffusers/modeling_controlnet_xl.py

def generate(
    self,
    condition_pixel_values: torch.Tensor,
    input_ids: torch.Tensor,
    input2_ids: torch.Tensor,
    negative_input_ids: torch.Tensor,
    negative_input2_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    attention2_mask: Optional[torch.Tensor] = None,
    negative_attention_mask: Optional[torch.Tensor] = None,
    negative_attention2_mask: Optional[torch.Tensor] = None,
    height: Optional[int] = 1024,
    width: Optional[int] = 1024,
    guidance_scale: Optional[float] = 5.0,
    controlnet_conditioning_scale: Optional[Union[float, List[float]]] = 1.0,
):
    """
    Generate images using the model.

    The prompt and negative prompt are encoded through ``self.get_prompt_outputs`` and
    the resulting embeddings are handed to ``self.pipeline`` together with the
    condition image(s). Sampling uses a generator seeded with ``self.seed``.

    Args:
        condition_pixel_values (torch.Tensor): Condition pixel values. With a single ControlNet the
            tensor is passed to the pipeline as is; otherwise dims 0 and 1 are swapped and the result
            is split into a list (presumably one entry per ControlNet -- TODO confirm layout).
        input_ids (torch.Tensor): Input IDs.
        input2_ids (torch.Tensor): Second input IDs.
        negative_input_ids (torch.Tensor): Negative input IDs.
        negative_input2_ids (torch.Tensor): Negative second input IDs.
        attention_mask (Optional[torch.Tensor]): Attention mask (default: None).
        attention2_mask (Optional[torch.Tensor]): Second attention mask (default: None).
        negative_attention_mask (Optional[torch.Tensor]): Negative attention mask (default: None).
        negative_attention2_mask (Optional[torch.Tensor]): Negative second attention mask (default: None).
        height (Optional[int]): Height of the generated images (default: 1024).
        width (Optional[int]): Width of the generated images (default: 1024).
        guidance_scale (Optional[float]): Scale for guidance (default: 5.0).
        controlnet_conditioning_scale (Optional[Union[float, List[float]]]): ControlNet conditioning
            scale. ``None`` falls back to 1.0. A single value is repeated for every ControlNet when
            several are configured, and a one-element list is unwrapped when only one ControlNet
            is configured (default: 1.0).

    Returns:
        GenericOutputs: Generated images.
    """
    outputs = self.get_prompt_outputs(
        input_ids=input_ids,
        input2_ids=input2_ids,
        negative_input_ids=negative_input_ids,
        negative_input2_ids=negative_input2_ids,
        attention_mask=attention_mask,
        attention2_mask=attention2_mask,
        negative_attention_mask=negative_attention_mask,
        negative_attention2_mask=negative_attention2_mask,
    )

    # Treat tuples like lists so they are not mistaken for a scalar below.
    if isinstance(controlnet_conditioning_scale, tuple):
        controlnet_conditioning_scale = list(controlnet_conditioning_scale)

    if controlnet_conditioning_scale is None:
        if self.num_controlnets == 1:
            controlnet_conditioning_scale = 1.0
        else:
            controlnet_conditioning_scale = [1.0] * self.num_controlnets
    elif isinstance(controlnet_conditioning_scale, list):
        # A single-ControlNet pipeline expects a plain number, so unwrap `[x]`.
        if self.num_controlnets == 1 and len(controlnet_conditioning_scale) == 1:
            controlnet_conditioning_scale = controlnet_conditioning_scale[0]
    elif self.num_controlnets > 1:
        # Broadcast a scalar to one scale per ControlNet.
        controlnet_conditioning_scale = [
            controlnet_conditioning_scale
        ] * self.num_controlnets

    images = self.pipeline(
        image=condition_pixel_values
        if self.num_controlnets == 1
        else list(condition_pixel_values.transpose(0, 1)),
        prompt_embeds=outputs.prompt_embeds,
        negative_prompt_embeds=outputs.negative_prompt_embeds,
        pooled_prompt_embeds=outputs.pooled_prompt_embeds,
        negative_pooled_prompt_embeds=outputs.negative_pooled_prompt_embeds,
        generator=torch.Generator(device=self.pipeline.device).manual_seed(
            self.seed
        ),
        num_inference_steps=self.num_infer_timesteps,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        # "np" is the documented spelling for NumPy output; the legacy
        # "np.array" value is only accepted via a deprecation fallback.
        output_type="np",
    ).images

    return GenericOutputs(images=torch.from_numpy(images))

ControlNetXLForImage2ImageGeneration¤

Bases: GenericStableXLModel

Source code in src/unitorch/models/diffusers/modeling_controlnet_xl.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    text2_config_path: str,
    vae_config_path: str,
    controlnet_configs_path: Union[str, List[str]],
    scheduler_config_path: str,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    freeze_unet_encoder: Optional[bool] = False,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialise the base model and wrap it in an SDXL ControlNet image-to-image pipeline.

    Every argument is forwarded unchanged, by keyword, to the parent constructor.
    Afterwards a ``StableDiffusionXLControlNetImg2ImgPipeline`` is assembled from
    ``self.vae``, ``self.text``, ``self.text2``, ``self.unet``, ``self.controlnet``
    and ``self.scheduler`` (presumably created by the parent constructor) and
    stored on ``self.pipeline`` with its progress bar disabled.

    Args:
        config_path (str): Forwarded to the parent constructor.
        text_config_path (str): Forwarded to the parent constructor.
        text2_config_path (str): Forwarded to the parent constructor.
        vae_config_path (str): Forwarded to the parent constructor.
        controlnet_configs_path (Union[str, List[str]]): Forwarded to the parent constructor.
        scheduler_config_path (str): Forwarded to the parent constructor.
        quant_config_path (Optional[str]): Forwarded to the parent constructor (default: None).
        image_size (Optional[int]): Forwarded to the parent constructor (default: None).
        in_channels (Optional[int]): Forwarded to the parent constructor (default: None).
        out_channels (Optional[int]): Forwarded to the parent constructor (default: None).
        num_train_timesteps (Optional[int]): Forwarded to the parent constructor (default: 1000).
        num_infer_timesteps (Optional[int]): Forwarded to the parent constructor (default: 50).
        freeze_vae_encoder (Optional[bool]): Forwarded to the parent constructor (default: True).
        freeze_text_encoder (Optional[bool]): Forwarded to the parent constructor (default: True).
        freeze_unet_encoder (Optional[bool]): Forwarded to the parent constructor (default: False).
        snr_gamma (Optional[float]): Forwarded to the parent constructor (default: 5.0).
        seed (Optional[int]): Forwarded to the parent constructor (default: 1123).
    """
    parent_kwargs = {
        "config_path": config_path,
        "text_config_path": text_config_path,
        "text2_config_path": text2_config_path,
        "vae_config_path": vae_config_path,
        "controlnet_configs_path": controlnet_configs_path,
        "scheduler_config_path": scheduler_config_path,
        "quant_config_path": quant_config_path,
        "image_size": image_size,
        "in_channels": in_channels,
        "out_channels": out_channels,
        "num_train_timesteps": num_train_timesteps,
        "num_infer_timesteps": num_infer_timesteps,
        "freeze_vae_encoder": freeze_vae_encoder,
        "freeze_text_encoder": freeze_text_encoder,
        "freeze_unet_encoder": freeze_unet_encoder,
        "snr_gamma": snr_gamma,
        "seed": seed,
    }
    super().__init__(**parent_kwargs)

    # The pipeline is built without tokenizers (both are explicitly None).
    pipeline_components = {
        "vae": self.vae,
        "text_encoder": self.text,
        "text_encoder_2": self.text2,
        "unet": self.unet,
        "controlnet": self.controlnet,
        "scheduler": self.scheduler,
        "tokenizer": None,
        "tokenizer_2": None,
    }
    self.pipeline = StableDiffusionXLControlNetImg2ImgPipeline(**pipeline_components)
    self.pipeline.set_progress_bar_config(disable=True)

ControlNetXLForImageInpainting¤

Bases: GenericStableXLModel

Source code in src/unitorch/models/diffusers/modeling_controlnet_xl.py

def __init__(
    self,
    config_path: str,
    text_config_path: str,
    text2_config_path: str,
    vae_config_path: str,
    controlnet_configs_path: Union[str, List[str]],
    scheduler_config_path: str,
    inpainting_controlnet_config_path: Optional[str] = None,
    quant_config_path: Optional[str] = None,
    image_size: Optional[int] = None,
    in_channels: Optional[int] = None,
    out_channels: Optional[int] = None,
    num_train_timesteps: Optional[int] = 1000,
    num_infer_timesteps: Optional[int] = 50,
    freeze_vae_encoder: Optional[bool] = True,
    freeze_text_encoder: Optional[bool] = True,
    freeze_unet_encoder: Optional[bool] = False,
    snr_gamma: Optional[float] = 5.0,
    seed: Optional[int] = 1123,
):
    """
    Initialise the base model and wrap it in an SDXL ControlNet inpainting pipeline.

    Every argument is forwarded unchanged, by keyword, to the parent constructor.
    Afterwards a ``StableDiffusionXLControlNetInpaintPipeline`` is assembled from
    ``self.vae``, ``self.text``, ``self.text2``, ``self.unet``, ``self.controlnet``
    and ``self.scheduler`` (presumably created by the parent constructor) and
    stored on ``self.pipeline`` with its progress bar disabled.

    Args:
        config_path (str): Forwarded to the parent constructor.
        text_config_path (str): Forwarded to the parent constructor.
        text2_config_path (str): Forwarded to the parent constructor.
        vae_config_path (str): Forwarded to the parent constructor.
        controlnet_configs_path (Union[str, List[str]]): Forwarded to the parent constructor.
        scheduler_config_path (str): Forwarded to the parent constructor.
        inpainting_controlnet_config_path (Optional[str]): Forwarded to the parent constructor (default: None).
        quant_config_path (Optional[str]): Forwarded to the parent constructor (default: None).
        image_size (Optional[int]): Forwarded to the parent constructor (default: None).
        in_channels (Optional[int]): Forwarded to the parent constructor (default: None).
        out_channels (Optional[int]): Forwarded to the parent constructor (default: None).
        num_train_timesteps (Optional[int]): Forwarded to the parent constructor (default: 1000).
        num_infer_timesteps (Optional[int]): Forwarded to the parent constructor (default: 50).
        freeze_vae_encoder (Optional[bool]): Forwarded to the parent constructor (default: True).
        freeze_text_encoder (Optional[bool]): Forwarded to the parent constructor (default: True).
        freeze_unet_encoder (Optional[bool]): Forwarded to the parent constructor (default: False).
        snr_gamma (Optional[float]): Forwarded to the parent constructor (default: 5.0).
        seed (Optional[int]): Forwarded to the parent constructor (default: 1123).
    """
    super().__init__(
        config_path=config_path,
        text_config_path=text_config_path,
        text2_config_path=text2_config_path,
        vae_config_path=vae_config_path,
        controlnet_configs_path=controlnet_configs_path,
        scheduler_config_path=scheduler_config_path,
        inpainting_controlnet_config_path=inpainting_controlnet_config_path,
        quant_config_path=quant_config_path,
        image_size=image_size,
        in_channels=in_channels,
        out_channels=out_channels,
        num_train_timesteps=num_train_timesteps,
        num_infer_timesteps=num_infer_timesteps,
        freeze_vae_encoder=freeze_vae_encoder,
        freeze_text_encoder=freeze_text_encoder,
        freeze_unet_encoder=freeze_unet_encoder,
        snr_gamma=snr_gamma,
        seed=seed,
    )
    # The pipeline is built without tokenizers (both are explicitly None).
    self.pipeline = StableDiffusionXLControlNetInpaintPipeline(
        vae=self.vae,
        text_encoder=self.text,
        text_encoder_2=self.text2,
        unet=self.unet,
        controlnet=self.controlnet,
        scheduler=self.scheduler,
        tokenizer=None,
        tokenizer_2=None,
    )
    self.pipeline.set_progress_bar_config(disable=True)