Skip to content

vllm_omni.diffusion.models.helios.pipeline_helios

logger module-attribute

logger = getLogger(__name__)

HeliosPipeline

Bases: Module, CFGParallelMixin, ProgressBarMixin, DiffusionPipelineProfilerMixin

Helios text-to-video / image-to-video / video-to-video pipeline for vllm-omni.

Supports text-to-video (T2V), image-to-video (I2V, with an image input), and video-to-video (V2V, with a video input) generation. Implements chunked video generation with a multi-term memory history context.

current_timestep property

current_timestep

device instance-attribute

device = get_local_device()

do_classifier_free_guidance property

do_classifier_free_guidance

guidance_scale property

guidance_scale

is_distilled instance-attribute

is_distilled = get('scheduler_type') == 'dmd'

num_timesteps property

num_timesteps

od_config instance-attribute

od_config = od_config

scheduler instance-attribute

scheduler = HeliosScheduler(**scheduler_kwargs)

text_encoder instance-attribute

text_encoder = to(device)

tokenizer instance-attribute

tokenizer = from_pretrained(
    model,
    subfolder="tokenizer",
    local_files_only=local_files_only,
)

transformer instance-attribute

transformer = create_transformer_from_config(
    transformer_config, quant_config=quantization_config
)

vae instance-attribute

vae = to(device)

vae_scale_factor_spatial instance-attribute

vae_scale_factor_spatial = (
    scale_factor_spatial
    if getattr(self, "vae", None)
    else 8
)

vae_scale_factor_temporal instance-attribute

vae_scale_factor_temporal = (
    scale_factor_temporal
    if getattr(self, "vae", None)
    else 4
)

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder="transformer",
        revision=None,
        prefix="transformer.",
        fall_back_to_pt=True,
    )
]

encode_prompt

encode_prompt(
    prompt: str | list[str],
    negative_prompt: str | list[str] | None = None,
    do_classifier_free_guidance: bool = True,
    num_videos_per_prompt: int = 1,
    max_sequence_length: int = 226,
    device: device | None = None,
    dtype: dtype | None = None,
)

forward

forward(
    req: OmniDiffusionRequest,
    prompt: str | None = None,
    negative_prompt: str | None = None,
    height: int = 384,
    width: int = 640,
    num_inference_steps: int = 50,
    guidance_scale: float = 5.0,
    frame_num: int = 132,
    output_type: str | None = "np",
    generator: Generator | list[Generator] | None = None,
    prompt_embeds: Tensor | None = None,
    negative_prompt_embeds: Tensor | None = None,
    attention_kwargs: dict | None = None,
    history_sizes: list | None = None,
    num_latent_frames_per_chunk: int = 9,
    keep_first_frame: bool = True,
    image: Tensor | None = None,
    add_noise_to_image_latents: bool = True,
    image_noise_sigma_min: float = 0.111,
    image_noise_sigma_max: float = 0.135,
    video: Tensor | None = None,
    add_noise_to_video_latents: bool = True,
    video_noise_sigma_min: float = 0.111,
    video_noise_sigma_max: float = 0.135,
    is_enable_stage2: bool = False,
    pyramid_num_stages: int = 3,
    pyramid_num_inference_steps_list: list | None = None,
    is_skip_first_chunk: bool = False,
    is_amplify_first_chunk: bool = False,
    use_cfg_zero_star: bool = False,
    use_zero_init: bool = True,
    zero_steps: int = 1,
    **kwargs,
) -> DiffusionOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

predict_noise

predict_noise(**kwargs: Any) -> Tensor

prepare_image_latents

prepare_image_latents(
    image: Tensor,
    latents_mean: Tensor,
    latents_std: Tensor,
    num_latent_frames_per_chunk: int,
    dtype: dtype | None = None,
    device: device | None = None,
    generator: Generator | list[Generator] | None = None,
) -> tuple[Tensor, Tensor]

Encode a single image into VAE latent space for I2V generation.

Returns the tuple (image_latents, fake_image_latents), where fake_image_latents is the last-frame latent of a repeated-frame video and is used to seed the history buffer for the first denoising chunk.

prepare_latents

prepare_latents(
    batch_size: int,
    num_channels_latents: int,
    height: int,
    width: int,
    num_frames: int,
    dtype: dtype | None,
    device: device | None,
    generator: Generator | list[Generator] | None,
    latents: Tensor | None = None,
) -> Tensor

prepare_video_latents

prepare_video_latents(
    video: Tensor,
    latents_mean: Tensor,
    latents_std: Tensor,
    num_latent_frames_per_chunk: int,
    dtype: dtype | None = None,
    device: device | None = None,
    generator: Generator | list[Generator] | None = None,
) -> tuple[Tensor, Tensor]

Encode a video into VAE latent space for V2V generation.

Returns the tuple (first_frame_latent, video_latents), where first_frame_latent is used as the image prefix and video_latents fills the history buffer.

sample_block_noise

sample_block_noise(
    batch_size,
    channel,
    num_frames,
    height,
    width,
    patch_size=(1, 2, 2),
    generator=None,
)

calculate_shift

calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
)

create_transformer_from_config

create_transformer_from_config(
    config: dict,
    quant_config: QuantizationConfig | None = None,
) -> HeliosTransformer3DModel

get_helios_post_process_func

get_helios_post_process_func(
    od_config: OmniDiffusionConfig,
)

get_helios_pre_process_func

get_helios_pre_process_func(od_config: OmniDiffusionConfig)

load_json_config

load_json_config(
    model_path: str,
    subfolder: str,
    filename: str,
    local_files_only: bool = True,
) -> dict

Load a JSON config file from a local path or HuggingFace Hub repo.

load_transformer_config

load_transformer_config(
    model_path: str,
    subfolder: str = "transformer",
    local_files_only: bool = True,
) -> dict

optimized_scale

optimized_scale(positive_flat, negative_flat)