Skip to content

vllm_omni.diffusion.models.hunyuan_image3.autoencoder

AttnBlock

Bases: Module

Attention block implemented with PyTorch's scaled dot-product attention (SDPA).

in_channels instance-attribute

in_channels = in_channels

k instance-attribute

k = Conv3d(in_channels, in_channels, kernel_size=1)

norm instance-attribute

norm = GroupNorm(
    num_groups=32,
    num_channels=in_channels,
    eps=1e-06,
    affine=True,
)

proj_out instance-attribute

proj_out = Conv3d(in_channels, in_channels, kernel_size=1)

q instance-attribute

q = Conv3d(in_channels, in_channels, kernel_size=1)

v instance-attribute

v = Conv3d(in_channels, in_channels, kernel_size=1)

attention

attention(h_: Tensor) -> Tensor

forward

forward(x: Tensor) -> Tensor

AutoencoderKLConv3D

Bases: ModelMixin, ConfigMixin

Autoencoder model with KL-regularized latent space based on 3D convolutions.

decoder instance-attribute

decoder = Decoder(
    z_channels=latent_channels,
    out_channels=out_channels,
    block_out_channels=list(reversed(block_out_channels)),
    num_res_blocks=layers_per_block,
    ffactor_spatial=ffactor_spatial,
    ffactor_temporal=ffactor_temporal,
    upsample_match_channel=upsample_match_channel,
)

encoder instance-attribute

encoder = Encoder(
    in_channels=in_channels,
    z_channels=latent_channels,
    block_out_channels=block_out_channels,
    num_res_blocks=layers_per_block,
    ffactor_spatial=ffactor_spatial,
    ffactor_temporal=ffactor_temporal,
    downsample_match_channel=downsample_match_channel,
)

ffactor_spatial instance-attribute

ffactor_spatial = ffactor_spatial

ffactor_temporal instance-attribute

ffactor_temporal = ffactor_temporal

scaling_factor instance-attribute

scaling_factor = scaling_factor

shift_factor instance-attribute

shift_factor = shift_factor

slicing_bsz instance-attribute

slicing_bsz = 1

tile_latent_min_size instance-attribute

tile_latent_min_size = sample_size // ffactor_spatial

tile_latent_min_tsize instance-attribute

tile_latent_min_tsize = sample_tsize // ffactor_temporal

tile_overlap_factor instance-attribute

tile_overlap_factor = 0.25

tile_sample_min_size instance-attribute

tile_sample_min_size = sample_size

tile_sample_min_tsize instance-attribute

tile_sample_min_tsize = sample_tsize

use_compile instance-attribute

use_compile = False

use_slicing instance-attribute

use_slicing = False

use_spatial_tiling instance-attribute

use_spatial_tiling = False

use_temporal_tiling instance-attribute

use_temporal_tiling = False

use_tiling_during_training instance-attribute

use_tiling_during_training = False

blend_h

blend_h(a: Tensor, b: Tensor, blend_extent: int)

blend_t

blend_t(a: Tensor, b: Tensor, blend_extent: int)

blend_v

blend_v(a: Tensor, b: Tensor, blend_extent: int)

decode

decode(z: Tensor, return_dict: bool = True, generator=None)

Decodes the input by passing it through the decoder network. Supports slicing and tiling for memory efficiency.

encode

encode(x: Tensor, return_dict: bool = True)

Encodes the input by passing it through the encoder network. Supports slicing and tiling for memory efficiency.

spatial_tiled_decode

spatial_tiled_decode(z: Tensor)

Spatial tiling for frames.

spatial_tiled_encode

spatial_tiled_encode(x: Tensor)

Spatial tiling for frames.

temporal_tiled_decode

temporal_tiled_decode(z: Tensor)

Temporal tiling for frames.

temporal_tiled_encode

temporal_tiled_encode(x: Tensor)

Temporal tiling for frames.

Conv3d

Bases: Conv3d

Performs Conv3d on patches; the result differs numerically from nn.Conv3d by no more than 1e-5. Only symmetric padding is supported.

forward

forward(input)

Decoder

Bases: Module

The decoder network of AutoencoderKLConv3D.

block_out_channels instance-attribute

block_out_channels = block_out_channels

conv_in instance-attribute

conv_in = Conv3d(
    z_channels, block_in, kernel_size=3, stride=1, padding=1
)

conv_out instance-attribute

conv_out = Conv3d(
    block_in,
    out_channels,
    kernel_size=3,
    stride=1,
    padding=1,
)

gradient_checkpointing instance-attribute

gradient_checkpointing = False

mid instance-attribute

mid = Module()

norm_out instance-attribute

norm_out = GroupNorm(
    num_groups=32,
    num_channels=block_in,
    eps=1e-06,
    affine=True,
)

num_res_blocks instance-attribute

num_res_blocks = num_res_blocks

up instance-attribute

up = ModuleList()

z_channels instance-attribute

z_channels = z_channels

forward

forward(z: Tensor) -> Tensor

DecoderOutput dataclass

Bases: BaseOutput

posterior class-attribute instance-attribute

posterior: DiagonalGaussianDistribution | None = None

sample instance-attribute

sample: FloatTensor

DiagonalGaussianDistribution

deterministic instance-attribute

deterministic = deterministic

logvar instance-attribute

logvar = clamp(logvar, -30.0, 20.0)

parameters instance-attribute

parameters = parameters

std instance-attribute

std = exp(0.5 * logvar)

var instance-attribute

var = exp(logvar)

sample

sample(generator: Generator | None = None) -> FloatTensor

DownsampleDCAE

Bases: Module

add_temporal_downsample instance-attribute

add_temporal_downsample = add_temporal_downsample

conv instance-attribute

conv = Conv3d(
    in_channels,
    out_channels // factor,
    kernel_size=3,
    stride=1,
    padding=1,
)

group_size instance-attribute

group_size = factor * in_channels // out_channels

forward

forward(x: Tensor)

Encoder

Bases: Module

The encoder network of AutoencoderKLConv3D.

block_out_channels instance-attribute

block_out_channels = block_out_channels

conv_in instance-attribute

conv_in = Conv3d(
    in_channels,
    block_out_channels[0],
    kernel_size=3,
    stride=1,
    padding=1,
)

conv_out instance-attribute

conv_out = Conv3d(
    block_in,
    2 * z_channels,
    kernel_size=3,
    stride=1,
    padding=1,
)

down instance-attribute

down = ModuleList()

gradient_checkpointing instance-attribute

gradient_checkpointing = False

mid instance-attribute

mid = Module()

norm_out instance-attribute

norm_out = GroupNorm(
    num_groups=32,
    num_channels=block_in,
    eps=1e-06,
    affine=True,
)

num_res_blocks instance-attribute

num_res_blocks = num_res_blocks

z_channels instance-attribute

z_channels = z_channels

forward

forward(x: Tensor) -> Tensor

ResnetBlock

Bases: Module

conv1 instance-attribute

conv1 = Conv3d(
    in_channels,
    out_channels,
    kernel_size=3,
    stride=1,
    padding=1,
)

conv2 instance-attribute

conv2 = Conv3d(
    out_channels,
    out_channels,
    kernel_size=3,
    stride=1,
    padding=1,
)

in_channels instance-attribute

in_channels = in_channels

nin_shortcut instance-attribute

nin_shortcut = Conv3d(
    in_channels,
    out_channels,
    kernel_size=1,
    stride=1,
    padding=0,
)

norm1 instance-attribute

norm1 = GroupNorm(
    num_groups=32,
    num_channels=in_channels,
    eps=1e-06,
    affine=True,
)

norm2 instance-attribute

norm2 = GroupNorm(
    num_groups=32,
    num_channels=out_channels,
    eps=1e-06,
    affine=True,
)

out_channels instance-attribute

out_channels = out_channels

forward

forward(x)

UpsampleDCAE

Bases: Module

add_temporal_upsample instance-attribute

add_temporal_upsample = add_temporal_upsample

conv instance-attribute

conv = Conv3d(
    in_channels,
    out_channels * factor,
    kernel_size=3,
    stride=1,
    padding=1,
)

repeats instance-attribute

repeats = factor * out_channels // in_channels

forward

forward(x: Tensor)

forward_with_checkpointing

forward_with_checkpointing(
    module, *inputs, use_checkpointing=False
)

swish

swish(x: Tensor) -> Tensor