vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 ¶
HunyuanImage3Pipeline ¶
Bases: HunyuanImage3PreTrainedModel, GenerationMixin, SupportImageInput, DiffusionPipelineProfilerMixin
final_layer instance-attribute ¶
final_layer = UNetUp(
patch_size=patch_size,
emb_channels=hidden_size,
in_channels=hidden_size,
hidden_channels=patch_embed_hidden_dim,
out_channels=vae["latent_channels"],
out_norm=True,
)
hf_to_vllm_mapper class-attribute instance-attribute ¶
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"model.": ""},
orig_to_new_substr={
"mlp.gate.wg.": "mlp.gate.",
"gate_and_up_proj.": "gate_up_proj.",
},
)
patch_embed instance-attribute ¶
patch_embed = UNetDown(
patch_size=patch_size,
emb_channels=hidden_size,
in_channels=vae["latent_channels"],
hidden_channels=patch_embed_hidden_dim,
out_channels=hidden_size,
)
weights_sources instance-attribute ¶
weights_sources = [
ComponentSource(
model_or_path=model,
subfolder=None,
revision=revision,
prefix="",
fall_back_to_pt=True,
)
]
forward ¶
forward(
req: OmniDiffusionRequest,
prompt: str | list[str] = "",
image_size="auto",
height: int = 1024,
width: int = 1024,
num_inference_steps: int = 50,
guidance_scale: float = 5.0,
generator: Generator | list[Generator] | None = None,
**kwargs,
) -> DiffusionOutput
forward_call ¶
forward_call(
input_ids: LongTensor | None = None,
attention_mask: Tensor | None = None,
position_ids: LongTensor | None = None,
past_key_values: list[FloatTensor] | None = None,
use_cache: bool | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
custom_pos_emb: tuple[FloatTensor] | None = None,
mode: str = "gen_text",
first_step: bool | None = None,
images: BatchRaggedImages | None = None,
image_mask: Tensor | None = None,
timestep: BatchRaggedTensor | None = None,
gen_timestep_scatter_index: Tensor | None = None,
cond_vae_images: BatchRaggedImages | None = None,
cond_timestep: BatchRaggedTensor | None = None,
cond_vae_image_mask: Tensor | None = None,
cond_vit_images: BatchRaggedImages | None = None,
cond_vit_image_mask: Tensor | None = None,
vit_kwargs: dict[str, Any] | None = None,
cond_timestep_scatter_index: Tensor | None = None,
query_lens: list[int] | None = None,
seq_lens: list[int] | None = None,
num_image_tokens: int | None = None,
uncond_cfg_prefill: bool = False,
ar_kv_reuse_len: int = 0,
full_attn_spans: list[list[tuple[int, int]]] | None = None,
) -> tuple | CausalMMOutputWithPast
inject_ar_kv_into_layers ¶
inject_ar_kv_into_layers(
ar_kv_data: dict[int, dict[str, Tensor]],
positive_reuse_len: int,
) -> None
Inject AR-stage KV cache into each layer's ImageKVCacheManager.
Truncates to positive_reuse_len and sets image_kv_cache_map directly.
instantiate_timestep_tokens ¶
instantiate_timestep_tokens(
x: Tensor,
t: BatchRaggedTensor,
timestep_scatter_index: BatchRaggedTensor,
)
instantiate_vae_image_tokens ¶
instantiate_vae_image_tokens(
x: Tensor,
images: BatchRaggedImages,
ts: BatchRaggedTensor,
image_mask: Tensor,
)
Instantiate the VAE image embeddings into the input embedding sequence.

Args:
    x (torch.Tensor): Input sequence tensor with shape (batch_size, seq_len, n_embd).
    images (BatchRaggedImages): Batch of images to embed. Can be:
        - A 4-D tensor (batch, channels, height, width)
        - A list of 4-D tensors (variable number of images per batch)
        - A list of lists of 3-D tensors (ragged batch structure)
    ts (BatchRaggedTensor, optional): Timestep tensor(s) for conditioning. Can be:
        - A 1-D tensor (single timestep per batch)
        - A list of 1-D tensors (variable timesteps per batch)
    image_mask (torch.Tensor, optional): Boolean mask tensor with shape (batch_size, seq_len) indicating which positions should be replaced with image embeddings.
instantiate_vit_image_tokens ¶
instantiate_vit_image_tokens(
x: Tensor,
cond_vit_images: Tensor | list[Tensor],
cond_vit_image_mask: Tensor,
vit_kwargs: dict[str, Any],
)
prepare_inputs_for_generation ¶
prepare_inputs_for_generation(
input_ids,
past_key_values=None,
attention_mask=None,
inputs_embeds=None,
tokenizer_output=None,
batch_gen_image_info=None,
generator=None,
**kwargs,
)
prepare_model_inputs ¶
prepare_model_inputs(
prompt=None,
mode="gen_image",
system_prompt=None,
cot_text=None,
num_inference_steps=50,
guidance_scale=5.0,
image_size="auto",
message_list=None,
device=None,
max_new_tokens=None,
**kwargs,
)
get_hunyuan_image_3_pre_process_func ¶
get_hunyuan_image_3_pre_process_func(
od_config: OmniDiffusionConfig,
)