Skip to content

vllm_omni.diffusion.models.hunyuan_video

HunyuanVideo-1.5 diffusion model components for text-to-video (T2V) and image-to-video (I2V) generation.

Modules:

Name Description
hunyuan_video_15_transformer
pipeline_hunyuan_video_1_5
pipeline_hunyuan_video_1_5_i2v

HunyuanVideo15I2VPipeline

Bases: Module, CFGParallelMixin, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin

color_format class-attribute instance-attribute

color_format = 'RGB'

current_timestep property

current_timestep

device instance-attribute

device = get_local_device()

feature_extractor instance-attribute

feature_extractor = from_pretrained(
    model,
    subfolder="feature_extractor",
    local_files_only=local_files_only,
)

guidance_scale property

guidance_scale

image_encoder instance-attribute

image_encoder = to(device)

num_channels_latents instance-attribute

num_channels_latents = (
    latent_channels if hasattr(vae, "config") else 32
)

num_timesteps property

num_timesteps

od_config instance-attribute

od_config = od_config

prompt_template_encode_start_idx instance-attribute

prompt_template_encode_start_idx = 108

scheduler instance-attribute

scheduler = from_pretrained(
    model,
    subfolder="scheduler",
    local_files_only=local_files_only,
)

support_image_input class-attribute instance-attribute

support_image_input = True

system_message instance-attribute

system_message = "You are a helpful assistant. Describe the video by detailing the following aspects:         1. The main content and theme of the video.         2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.         3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.         4. background environment, light, style and atmosphere.         5. camera angles, movements, and transitions used in the video."

text_encoder instance-attribute

text_encoder = to(device)

text_encoder_2 instance-attribute

text_encoder_2 = to(dtype=dtype, device=device)

tokenizer instance-attribute

tokenizer = from_pretrained(
    model,
    subfolder="tokenizer",
    local_files_only=local_files_only,
)

tokenizer_2 instance-attribute

tokenizer_2 = from_pretrained(
    model,
    subfolder="tokenizer_2",
    local_files_only=local_files_only,
)

tokenizer_2_max_length instance-attribute

tokenizer_2_max_length = 256

tokenizer_max_length instance-attribute

tokenizer_max_length = 1000

transformer instance-attribute

transformer = HunyuanVideo15Transformer3DModel(
    od_config=od_config, **transformer_kwargs
)

use_meanflow instance-attribute

use_meanflow = getattr(
    tf_model_config, "use_meanflow", False
)

vae instance-attribute

vae = to(device)

vae_scale_factor_spatial instance-attribute

vae_scale_factor_spatial = (
    spatial_compression_ratio
    if hasattr(vae, "spatial_compression_ratio")
    else 16
)

vae_scale_factor_temporal instance-attribute

vae_scale_factor_temporal = (
    temporal_compression_ratio
    if hasattr(vae, "temporal_compression_ratio")
    else 4
)

vision_num_semantic_tokens instance-attribute

vision_num_semantic_tokens = 729

vision_states_dim instance-attribute

vision_states_dim = 1152

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder="transformer",
        revision=None,
        prefix="transformer.",
        fall_back_to_pt=True,
    ),
    ComponentSource(
        model_or_path=model,
        subfolder="text_encoder_2",
        revision=None,
        prefix="text_encoder_2.",
        fall_back_to_pt=True,
    ),
]

encode_prompt

encode_prompt(
    prompt: str | list[str],
    device: device,
    dtype: dtype,
    negative_prompt: str | list[str] | None = None,
    do_classifier_free_guidance: bool = False,
) -> tuple

forward

forward(
    req: OmniDiffusionRequest,
    num_inference_steps: int = 50,
    guidance_scale: float = 6.0,
    height: int = 480,
    width: int = 832,
    num_frames: int = 121,
    output_type: str | None = "np",
    generator: Generator | list[Generator] | None = None,
    **kwargs,
) -> DiffusionOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

predict_noise

predict_noise(**kwargs: Any) -> Tensor

prepare_cond_latents_and_mask

prepare_cond_latents_and_mask(
    latents: Tensor,
    image: Image,
    height: int,
    width: int,
    dtype: dtype,
    device: device,
) -> tuple[Tensor, Tensor]

Prepare the condition latents and mask for I2V: the first frame holds the image, and the remaining frames are zeros.

prepare_latents

prepare_latents(
    batch_size: int,
    height: int,
    width: int,
    num_frames: int,
    dtype: dtype,
    device: device,
    generator: Generator | list[Generator] | None = None,
    latents: Tensor | None = None,
) -> Tensor

HunyuanVideo15Pipeline

Bases: Module, CFGParallelMixin, ProgressBarMixin, DiffusionPipelineProfilerMixin

current_timestep property

current_timestep

device instance-attribute

device = get_local_device()

guidance_scale property

guidance_scale

num_channels_latents instance-attribute

num_channels_latents = (
    latent_channels if hasattr(vae, "config") else 32
)

num_timesteps property

num_timesteps

od_config instance-attribute

od_config = od_config

prompt_template_encode_start_idx instance-attribute

prompt_template_encode_start_idx = 108

scheduler instance-attribute

scheduler = from_pretrained(
    model,
    subfolder="scheduler",
    local_files_only=local_files_only,
)

system_message instance-attribute

system_message = "You are a helpful assistant. Describe the video by detailing the following aspects:         1. The main content and theme of the video.         2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.         3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.         4. background environment, light, style and atmosphere.         5. camera angles, movements, and transitions used in the video."

text_encoder instance-attribute

text_encoder = to(device)

text_encoder_2 instance-attribute

text_encoder_2 = to(dtype=dtype, device=device)

tokenizer instance-attribute

tokenizer = from_pretrained(
    model,
    subfolder="tokenizer",
    local_files_only=local_files_only,
)

tokenizer_2 instance-attribute

tokenizer_2 = from_pretrained(
    model,
    subfolder="tokenizer_2",
    local_files_only=local_files_only,
)

tokenizer_2_max_length instance-attribute

tokenizer_2_max_length = 256

tokenizer_max_length instance-attribute

tokenizer_max_length = 1000

transformer instance-attribute

transformer = HunyuanVideo15Transformer3DModel(
    od_config=od_config, **transformer_kwargs
)

use_meanflow instance-attribute

use_meanflow = getattr(
    tf_model_config, "use_meanflow", False
)

vae instance-attribute

vae = to(device)

vae_scale_factor_spatial instance-attribute

vae_scale_factor_spatial = (
    spatial_compression_ratio
    if hasattr(vae, "spatial_compression_ratio")
    else 16
)

vae_scale_factor_temporal instance-attribute

vae_scale_factor_temporal = (
    temporal_compression_ratio
    if hasattr(vae, "temporal_compression_ratio")
    else 4
)

vision_num_semantic_tokens instance-attribute

vision_num_semantic_tokens = 729

vision_states_dim instance-attribute

vision_states_dim = 1152

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder="transformer",
        revision=None,
        prefix="transformer.",
        fall_back_to_pt=True,
    ),
    ComponentSource(
        model_or_path=model,
        subfolder="text_encoder_2",
        revision=None,
        prefix="text_encoder_2.",
        fall_back_to_pt=True,
    ),
]

encode_prompt

encode_prompt(
    prompt: str | list[str],
    device: device,
    dtype: dtype,
    negative_prompt: str | list[str] | None = None,
    do_classifier_free_guidance: bool = False,
) -> tuple[
    Tensor,
    Tensor,
    Tensor,
    Tensor,
    Tensor | None,
    Tensor | None,
    Tensor | None,
    Tensor | None,
]

forward

forward(
    req: OmniDiffusionRequest,
    num_inference_steps: int = 50,
    guidance_scale: float = 6.0,
    height: int = 480,
    width: int = 832,
    num_frames: int = 121,
    output_type: str | None = "np",
    generator: Generator | list[Generator] | None = None,
    **kwargs,
) -> DiffusionOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

predict_noise

predict_noise(**kwargs: Any) -> Tensor

prepare_cond_latents_and_mask

prepare_cond_latents_and_mask(
    latents: Tensor, dtype: dtype, device: device
) -> tuple[Tensor, Tensor]

Prepare zero condition latents and mask for T2V mode.

prepare_latents

prepare_latents(
    batch_size: int,
    height: int,
    width: int,
    num_frames: int,
    dtype: dtype,
    device: device,
    generator: Generator | list[Generator] | None = None,
    latents: Tensor | None = None,
) -> Tensor

HunyuanVideo15Transformer3DModel

Bases: Module

HunyuanVideo-1.5 Transformer with TP-optimized dual-stream attention.

Ported from the diffusers HunyuanVideo15Transformer3DModel, using vllm-omni tensor-parallel layers for the 54 main transformer blocks.

cond_type_embed instance-attribute

cond_type_embed = Embedding(3, inner_dim)

context_embedder instance-attribute

context_embedder = HunyuanVideo15TokenRefiner(
    text_embed_dim,
    num_attention_heads,
    attention_head_dim,
    num_layers=num_refiner_layers,
)

context_embedder_2 instance-attribute

context_embedder_2 = HunyuanVideo15ByT5TextProjection(
    text_embed_2_dim, 2048, inner_dim
)

image_embedder instance-attribute

image_embedder = HunyuanVideo15ImageProjection(
    image_embed_dim, inner_dim
)

in_channels instance-attribute

in_channels = in_channels

inner_dim instance-attribute

inner_dim = inner_dim

norm_out instance-attribute

norm_out = AdaLayerNormContinuous(
    inner_dim,
    inner_dim,
    elementwise_affine=False,
    eps=1e-06,
)

out_channels instance-attribute

out_channels = out_channels or in_channels

packed_modules_mapping class-attribute instance-attribute

packed_modules_mapping = {
    "to_qkv": ["to_q", "to_k", "to_v"],
    "add_kv_proj": [
        "add_q_proj",
        "add_k_proj",
        "add_v_proj",
    ],
}

parallel_config instance-attribute

parallel_config = parallel_config

patch_size instance-attribute

patch_size = patch_size

patch_size_t instance-attribute

patch_size_t = patch_size_t

proj_out instance-attribute

proj_out = Linear(
    inner_dim,
    patch_size_t * patch_size * patch_size * out_channels,
)

rope instance-attribute

rope = HunyuanVideo15RotaryPosEmbed(
    patch_size,
    patch_size_t,
    list(rope_axes_dim),
    rope_theta,
)

time_embed instance-attribute

time_embed = HunyuanVideo15TimeEmbedding(
    inner_dim, use_meanflow=use_meanflow
)

transformer_blocks instance-attribute

transformer_blocks = ModuleList(
    [
        (
            HunyuanVideo15TransformerBlock(
                num_attention_heads,
                attention_head_dim,
                mlp_ratio=mlp_ratio,
                qk_norm=qk_norm,
            )
        )
        for _ in (range(num_layers))
    ]
)

x_embedder instance-attribute

x_embedder = HunyuanVideo15PatchEmbed(
    (patch_size_t, patch_size, patch_size),
    in_channels,
    inner_dim,
)

forward

forward(
    hidden_states: Tensor,
    timestep: LongTensor,
    encoder_hidden_states: Tensor,
    encoder_attention_mask: Tensor,
    timestep_r: LongTensor | None = None,
    encoder_hidden_states_2: Tensor | None = None,
    encoder_attention_mask_2: Tensor | None = None,
    image_embeds: Tensor | None = None,
    image_embeds_mask: Tensor | None = None,
    attention_kwargs: dict[str, Any] | None = None,
    return_dict: bool = True,
) -> Tensor | Transformer2DModelOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

get_hunyuan_video_15_i2v_post_process_func

get_hunyuan_video_15_i2v_post_process_func(
    od_config: OmniDiffusionConfig,
)

get_hunyuan_video_15_i2v_pre_process_func

get_hunyuan_video_15_i2v_pre_process_func(
    od_config: OmniDiffusionConfig,
)

Pre-process function for I2V: loads and resizes the input image.

get_hunyuan_video_15_post_process_func

get_hunyuan_video_15_post_process_func(
    od_config: OmniDiffusionConfig,
)