Skip to content

vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3

BatchRaggedImages module-attribute

BatchRaggedImages = Tensor | list[Tensor | list[Tensor]]

BatchRaggedTensor module-attribute

BatchRaggedTensor = Tensor | list[Tensor]

logger module-attribute

logger = getLogger(__name__)

HunyuanImage3Pipeline

Bases: HunyuanImage3PreTrainedModel, GenerationMixin, SupportImageInput, DiffusionPipelineProfilerMixin

final_layer instance-attribute

final_layer = UNetUp(
    patch_size=patch_size,
    emb_channels=hidden_size,
    in_channels=hidden_size,
    hidden_channels=patch_embed_hidden_dim,
    out_channels=vae["latent_channels"],
    out_norm=True,
)

generation_config instance-attribute

generation_config = from_pretrained(model)

hf_config instance-attribute

hf_config = get_config(model, trust_remote_code=True)

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={"model.": ""},
    orig_to_new_substr={
        "mlp.gate.wg.": "mlp.gate.",
        "gate_and_up_proj.": "gate_up_proj.",
    },
)

image_processor instance-attribute

image_processor = HunyuanImage3ImageProcessor(hf_config)

lm_head instance-attribute

lm_head = Linear(hidden_size, vocab_size, bias=False)

model instance-attribute

model = HunyuanImage3Model(
    hf_config, quant_config=quant_config
)

od_config instance-attribute

od_config = od_config

patch_embed instance-attribute

patch_embed = UNetDown(
    patch_size=patch_size,
    emb_channels=hidden_size,
    in_channels=vae["latent_channels"],
    hidden_channels=patch_embed_hidden_dim,
    out_channels=hidden_size,
)

pipeline property

pipeline

support_image_input class-attribute instance-attribute

support_image_input = True

time_embed instance-attribute

time_embed = TimestepEmbedder(hidden_size=hidden_size)

time_embed_2 instance-attribute

time_embed_2 = TimestepEmbedder(hidden_size=hidden_size)

timestep_emb instance-attribute

timestep_emb = TimestepEmbedder(hidden_size=hidden_size)

transformer instance-attribute

transformer = model

vae instance-attribute

vae = from_config(vae)

vision_aligner instance-attribute

vision_aligner = LightProjector(vit_aligner)

vision_model instance-attribute

vision_model = Siglip2VisionTransformer(vit)

vllm_config instance-attribute

vllm_config = get_current_vllm_config()

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder=None,
        revision=revision,
        prefix="",
        fall_back_to_pt=True,
    )
]

build_batch_rope_image_info staticmethod

build_batch_rope_image_info(output, sections)

check_inputs staticmethod

check_inputs(prompt=None, message_list=None)

forward

forward(
    req: OmniDiffusionRequest,
    prompt: str | list[str] = "",
    image_size="auto",
    height: int = 1024,
    width: int = 1024,
    num_inference_steps: int = 50,
    guidance_scale: float = 5.0,
    generator: Generator | list[Generator] | None = None,
    **kwargs,
) -> DiffusionOutput

forward_call

forward_call(
    input_ids: LongTensor | None = None,
    attention_mask: Tensor | None = None,
    position_ids: LongTensor | None = None,
    past_key_values: list[FloatTensor] | None = None,
    use_cache: bool | None = None,
    output_attentions: bool | None = None,
    output_hidden_states: bool | None = None,
    return_dict: bool | None = None,
    custom_pos_emb: tuple[FloatTensor] | None = None,
    mode: str = "gen_text",
    first_step: bool | None = None,
    images: BatchRaggedImages | None = None,
    image_mask: Tensor | None = None,
    timestep: BatchRaggedTensor | None = None,
    gen_timestep_scatter_index: Tensor | None = None,
    cond_vae_images: BatchRaggedImages | None = None,
    cond_timestep: BatchRaggedTensor | None = None,
    cond_vae_image_mask: Tensor | None = None,
    cond_vit_images: BatchRaggedImages | None = None,
    cond_vit_image_mask: Tensor | None = None,
    vit_kwargs: dict[str, Any] | None = None,
    cond_timestep_scatter_index: Tensor | None = None,
    query_lens: list[int] | None = None,
    seq_lens: list[int] | None = None,
    num_image_tokens: int | None = None,
    uncond_cfg_prefill: bool = False,
    ar_kv_reuse_len: int = 0,
    full_attn_spans: list[list[tuple[int, int]]]
    | None = None,
) -> tuple | CausalMMOutputWithPast

get_pos_emb staticmethod

get_pos_emb(custom_pos_emb, position_ids)

inject_ar_kv_into_layers

inject_ar_kv_into_layers(
    ar_kv_data: dict[int, dict[str, Tensor]],
    positive_reuse_len: int,
) -> None

Inject AR-stage KV cache into each layer's ImageKVCacheManager.

Truncates to positive_reuse_len and sets image_kv_cache_map directly.

instantiate_timestep_tokens

instantiate_timestep_tokens(
    x: Tensor,
    t: BatchRaggedTensor,
    timestep_scatter_index: BatchRaggedTensor,
)

instantiate_vae_image_tokens

instantiate_vae_image_tokens(
    x: Tensor,
    images: BatchRaggedImages,
    ts: BatchRaggedTensor,
    image_mask: Tensor,
)

Instantiate the VAE image embeddings into the input embedding sequence.

Args:

- x (torch.Tensor): Input sequence tensor with shape (batch_size, seq_len, n_embd).
- images (BatchRaggedImages): Batch of images to embed. Can be:
    - A 4-D tensor (batch, channels, height, width)
    - A list of 4-D tensors (variable number of images per batch)
    - A list of lists of 3-D tensors (ragged batch structure)
- ts (BatchRaggedTensor, optional): Timestep tensor(s) for conditioning. Can be:
    - A 1-D tensor (single timestep per batch)
    - A list of 1-D tensors (variable timesteps per batch)
- image_mask (torch.Tensor, optional): Boolean mask tensor with shape (batch_size, seq_len) indicating which positions should be replaced with image embeddings.

instantiate_vit_image_tokens

instantiate_vit_image_tokens(
    x: Tensor,
    cond_vit_images: Tensor | list[Tensor],
    cond_vit_image_mask: Tensor,
    vit_kwargs: dict[str, Any],
)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

prepare_inputs_for_generation

prepare_inputs_for_generation(
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    tokenizer_output=None,
    batch_gen_image_info=None,
    generator=None,
    **kwargs,
)

prepare_model_inputs

prepare_model_inputs(
    prompt=None,
    mode="gen_image",
    system_prompt=None,
    cot_text=None,
    num_inference_steps=50,
    guidance_scale=5.0,
    image_size="auto",
    message_list=None,
    device=None,
    max_new_tokens=None,
    **kwargs,
)

prepare_seed

prepare_seed(seed=None, batch_size=1)

ragged_final_layer

ragged_final_layer(
    x, image_mask, timestep, token_h, token_w, first_step
)

vae_encode

vae_encode(image, cfg_factor=1, generator=None)

default

default(val, d)

get_hunyuan_image_3_pre_process_func

get_hunyuan_image_3_pre_process_func(
    od_config: OmniDiffusionConfig,
)

to_device

to_device(data, device)