vllm_omni.diffusion.models.helios ¶
Modules:
| Name | Description |
|---|---|
| helios_transformer | |
| pipeline_helios | |
| scheduling_helios | |
HeliosPipeline ¶
Bases: Module, CFGParallelMixin, ProgressBarMixin, DiffusionPipelineProfilerMixin
Helios text-to-video / image-to-video / video-to-video pipeline for vllm-omni.
Supports T2V, I2V (with image input), and V2V (with video input). Implements chunked video generation in which each chunk is conditioned on a multi-term memory of earlier history (short-, mid-, and long-term history latents).
tokenizer instance-attribute ¶
transformer instance-attribute ¶
transformer = create_transformer_from_config(
transformer_config, quant_config=quantization_config
)
vae_scale_factor_spatial instance-attribute ¶
vae_scale_factor_spatial = (
scale_factor_spatial
if getattr(self, "vae", None)
else 8
)
vae_scale_factor_temporal instance-attribute ¶
vae_scale_factor_temporal = (
scale_factor_temporal
if getattr(self, "vae", None)
else 4
)
weights_sources instance-attribute ¶
weights_sources = [
ComponentSource(
model_or_path=model,
subfolder="transformer",
revision=None,
prefix="transformer.",
fall_back_to_pt=True,
)
]
encode_prompt ¶
encode_prompt(
prompt: str | list[str],
negative_prompt: str | list[str] | None = None,
do_classifier_free_guidance: bool = True,
num_videos_per_prompt: int = 1,
max_sequence_length: int = 226,
device: device | None = None,
dtype: dtype | None = None,
)
forward ¶
forward(
req: OmniDiffusionRequest,
prompt: str | None = None,
negative_prompt: str | None = None,
height: int = 384,
width: int = 640,
num_inference_steps: int = 50,
guidance_scale: float = 5.0,
frame_num: int = 132,
output_type: str | None = "np",
generator: Generator | list[Generator] | None = None,
prompt_embeds: Tensor | None = None,
negative_prompt_embeds: Tensor | None = None,
attention_kwargs: dict | None = None,
history_sizes: list | None = None,
num_latent_frames_per_chunk: int = 9,
keep_first_frame: bool = True,
image: Tensor | None = None,
add_noise_to_image_latents: bool = True,
image_noise_sigma_min: float = 0.111,
image_noise_sigma_max: float = 0.135,
video: Tensor | None = None,
add_noise_to_video_latents: bool = True,
video_noise_sigma_min: float = 0.111,
video_noise_sigma_max: float = 0.135,
is_enable_stage2: bool = False,
pyramid_num_stages: int = 3,
pyramid_num_inference_steps_list: list | None = None,
is_skip_first_chunk: bool = False,
is_amplify_first_chunk: bool = False,
use_cfg_zero_star: bool = False,
use_zero_init: bool = True,
zero_steps: int = 1,
**kwargs,
) -> DiffusionOutput
prepare_image_latents ¶
prepare_image_latents(
image: Tensor,
latents_mean: Tensor,
latents_std: Tensor,
num_latent_frames_per_chunk: int,
dtype: dtype | None = None,
device: device | None = None,
generator: Generator | list[Generator] | None = None,
) -> tuple[Tensor, Tensor]
Encode a single image into VAE latent space for I2V generation.
Returns (image_latents, fake_image_latents), where fake_image_latents is the last-frame latent of a repeated-frame video, used to seed the history buffer for the first denoising chunk.
prepare_latents ¶
prepare_latents(
batch_size: int,
num_channels_latents: int,
height: int,
width: int,
num_frames: int,
dtype: dtype | None,
device: device | None,
generator: Generator | list[Generator] | None,
latents: Tensor | None = None,
) -> Tensor
prepare_video_latents ¶
prepare_video_latents(
video: Tensor,
latents_mean: Tensor,
latents_std: Tensor,
num_latent_frames_per_chunk: int,
dtype: dtype | None = None,
device: device | None = None,
generator: Generator | list[Generator] | None = None,
) -> tuple[Tensor, Tensor]
Encode a video into VAE latent space for V2V generation.
Returns (first_frame_latent, video_latents), where first_frame_latent is used as the image prefix and video_latents fills the history buffer.
sample_block_noise ¶
sample_block_noise(
batch_size,
channel,
num_frames,
height,
width,
patch_size=(1, 2, 2),
generator=None,
)
HeliosScheduler ¶
Bases: SchedulerMixin, ConfigMixin
convert_model_output ¶
convert_model_output(
model_output: Tensor,
*args,
sample: Tensor = None,
sigma: Tensor = None,
**kwargs,
) -> Tensor
multistep_uni_c_bh_update ¶
multistep_uni_c_bh_update(
this_model_output: Tensor,
*args,
last_sample: Tensor = None,
this_sample: Tensor = None,
order: int = None,
sigma_before: Tensor = None,
sigma: Tensor = None,
**kwargs,
) -> Tensor
multistep_uni_p_bh_update ¶
multistep_uni_p_bh_update(
model_output: Tensor,
*args,
sample: Tensor = None,
order: int = None,
sigma: Tensor = None,
sigma_next: Tensor = None,
**kwargs,
) -> Tensor
set_timesteps ¶
set_timesteps(
num_inference_steps: int,
stage_index: int | None = None,
device: str | device = None,
sigmas: bool | None = None,
mu: bool | None = None,
is_amplify_first_chunk: bool = False,
)
step ¶
step(
model_output: FloatTensor,
timestep: float | FloatTensor = None,
sample: FloatTensor = None,
generator: Generator | None = None,
return_dict: bool = True,
cur_sampling_step: int = 0,
dmd_noisy_tensor: FloatTensor | None = None,
dmd_sigmas: FloatTensor | None = None,
dmd_timesteps: FloatTensor | None = None,
all_timesteps: FloatTensor | None = None,
) -> HeliosSchedulerOutput | tuple
step_dmd ¶
step_dmd(
model_output: FloatTensor,
timestep: float | FloatTensor = None,
sample: FloatTensor = None,
generator: Generator | None = None,
return_dict: bool = True,
cur_sampling_step: int = 0,
dmd_noisy_tensor: FloatTensor | None = None,
dmd_sigmas: FloatTensor | None = None,
dmd_timesteps: FloatTensor | None = None,
all_timesteps: FloatTensor | None = None,
)
step_euler ¶
step_euler(
model_output: FloatTensor,
timestep: float | FloatTensor = None,
sample: FloatTensor = None,
generator: Generator | None = None,
sigma: FloatTensor | None = None,
sigma_next: FloatTensor | None = None,
return_dict: bool = True,
) -> HeliosSchedulerOutput | tuple
step_unipc ¶
step_unipc(
model_output: Tensor,
timestep: int | Tensor = None,
sample: Tensor = None,
return_dict: bool = True,
model_outputs: list = None,
timestep_list: list = None,
sigma_before: Tensor = None,
sigma: Tensor = None,
sigma_next: Tensor = None,
cus_step_index: int = None,
cus_lower_order_num: int = None,
cus_this_order: int = None,
cus_last_sample: Tensor = None,
) -> HeliosSchedulerOutput | tuple
HeliosTransformer3DModel ¶
Bases: Module
Optimized Helios Transformer model for video generation using vLLM layers.
Helios extends the Wan2.2 architecture with multi-term memory patches, guidance cross-attention, and chunked video generation support.
blocks instance-attribute ¶
blocks = ModuleList(
[
(
HeliosTransformerBlock(
inner_dim,
ffn_dim,
num_attention_heads,
eps,
cross_attn_norm,
guidance_cross_attn=guidance_cross_attn,
is_amplify_history=is_amplify_history,
history_scale_mode=history_scale_mode,
quant_config=quant_config,
)
)
for _ in (range(num_layers))
]
)
condition_embedder instance-attribute ¶
condition_embedder = HeliosTimeTextEmbedding(
dim=inner_dim,
time_freq_dim=freq_dim,
time_proj_dim=inner_dim * 6,
text_embed_dim=text_dim,
)
config instance-attribute ¶
config = type(
"Config",
(),
{
"patch_size": patch_size,
"num_attention_heads": num_attention_heads,
"attention_head_dim": attention_head_dim,
"in_channels": in_channels,
"out_channels": out_channels,
"text_dim": text_dim,
"freq_dim": freq_dim,
"ffn_dim": ffn_dim,
"num_layers": num_layers,
"cross_attn_norm": cross_attn_norm,
"qk_norm": qk_norm,
"eps": eps,
"added_kv_proj_dim": added_kv_proj_dim,
"rope_dim": rope_dim,
"rope_theta": rope_theta,
"guidance_cross_attn": guidance_cross_attn,
"zero_history_timestep": zero_history_timestep,
"has_multi_term_memory_patch": has_multi_term_memory_patch,
"is_amplify_history": is_amplify_history,
"history_scale_mode": history_scale_mode,
},
)()
has_multi_term_memory_patch instance-attribute ¶
packed_modules_mapping class-attribute instance-attribute ¶
patch_embedding instance-attribute ¶
patch_embedding = Conv3dLayer(
in_channels=in_channels,
out_channels=inner_dim,
kernel_size=patch_size,
stride=patch_size,
)
patch_long instance-attribute ¶
patch_long = Conv3dLayer(
in_channels=in_channels,
out_channels=inner_dim,
kernel_size=(4, 8, 8),
stride=(4, 8, 8),
)
patch_mid instance-attribute ¶
patch_mid = Conv3dLayer(
in_channels=in_channels,
out_channels=inner_dim,
kernel_size=(2, 4, 4),
stride=(2, 4, 4),
)
patch_short instance-attribute ¶
patch_short = Conv3dLayer(
in_channels=in_channels,
out_channels=inner_dim,
kernel_size=(1, 2, 2),
stride=(1, 2, 2),
)
forward ¶
forward(
hidden_states: Tensor,
timestep: LongTensor,
encoder_hidden_states: Tensor,
indices_hidden_states: Tensor | None = None,
indices_latents_history_short: Tensor | None = None,
indices_latents_history_mid: Tensor | None = None,
indices_latents_history_long: Tensor | None = None,
latents_history_short: Tensor | None = None,
latents_history_mid: Tensor | None = None,
latents_history_long: Tensor | None = None,
return_dict: bool = True,
attention_kwargs: dict[str, Any] | None = None,
) -> Tensor | Transformer2DModelOutput
create_transformer_from_config ¶
create_transformer_from_config(
config: dict,
quant_config: QuantizationConfig | None = None,
) -> HeliosTransformer3DModel