Skip to content

vllm_omni.diffusion.models.ltx2.pipeline_ltx2_3

Fully independent LTX-2.3 pipeline for vLLM-Omni.

This pipeline does NOT inherit from LTX2Pipeline because:

- LTX-2.3 connectors run per_token_rms_norm + per-modality video/audio projection internally (per_modality_projections=True), versus LTX-2's per_layer_masked_mean_norm + shared projection path
- LTX-2.3 uses a BWE vocoder outputting 48kHz audio (not 16kHz)
- LTX-2.3 transformer requires the sigma parameter for prompt modulation
- CPU offloading is required for the 22B transformer (~44GB VRAM)

logger module-attribute

logger = init_logger(__name__)

LTX23ImageToVideoPipeline

Bases: Module

LTX-2.3 image-to-video pipeline placeholder.

LTX23Pipeline

Bases: Module, CFGParallelMixin, ProgressBarMixin

Fully independent LTX-2.3 pipeline.

Key differences from LTX2Pipeline:

- Text encoding: uses ALL 49 hidden states from Gemma-3-12B, flattened
- Connectors: uses padding_side API (not additive_mask)
- Vocoder: uses LTX2VocoderWithBWE (48kHz output)
- Transformer: passes sigma for prompt_adaln
- CPU offloading: text encoder, connectors, VAE, vocoder stay on CPU

audio_hop_length instance-attribute

audio_hop_length = (
    mel_hop_length if audio_vae is not None else 160
)

audio_sampling_rate instance-attribute

audio_sampling_rate = (
    sample_rate if audio_vae is not None else 16000
)

audio_vae instance-attribute

audio_vae = from_pretrained_with_prefetch(
    from_pretrained,
    model,
    subfolder="audio_vae",
    prefetch_list=ltx2_subfolders,
    local_files_only=local_files_only,
    torch_dtype=dtype,
)

audio_vae_mel_compression_ratio instance-attribute

audio_vae_mel_compression_ratio = (
    mel_compression_ratio if audio_vae is not None else 4
)

audio_vae_temporal_compression_ratio instance-attribute

audio_vae_temporal_compression_ratio = (
    temporal_compression_ratio
    if audio_vae is not None
    else 4
)

connectors instance-attribute

connectors = from_pretrained_with_prefetch(
    from_pretrained,
    model,
    subfolder="connectors",
    prefetch_list=ltx2_subfolders,
    local_files_only=local_files_only,
    torch_dtype=dtype,
)

current_timestep property

current_timestep

device instance-attribute

device = get_local_device()

do_classifier_free_guidance property

do_classifier_free_guidance

dummy_run_num_frames class-attribute instance-attribute

dummy_run_num_frames = 2

guidance_scale property

guidance_scale

interrupt property

interrupt

num_timesteps property

num_timesteps

od_config instance-attribute

od_config = od_config

scheduler instance-attribute

scheduler = from_pretrained(
    model,
    subfolder="scheduler",
    local_files_only=local_files_only,
)

text_encoder instance-attribute

text_encoder = from_pretrained_with_prefetch(
    from_pretrained,
    model,
    subfolder="text_encoder",
    prefetch_list=ltx2_subfolders,
    local_files_only=local_files_only,
    torch_dtype=dtype,
)

tokenizer instance-attribute

tokenizer = from_pretrained(
    model,
    subfolder="tokenizer",
    local_files_only=local_files_only,
)

tokenizer_max_length instance-attribute

tokenizer_max_length = int(tokenizer_max_length)

transformer instance-attribute

transformer = create_transformer_from_config(
    transformer_config, quant_config=quant_config
)

transformer_spatial_patch_size instance-attribute

transformer_spatial_patch_size = (
    patch_size if transformer is not None else 1
)

transformer_temporal_patch_size instance-attribute

transformer_temporal_patch_size = (
    patch_size_t if transformer is not None else 1
)

vae instance-attribute

vae = from_pretrained_with_prefetch(
    from_pretrained,
    model,
    subfolder="vae",
    prefetch_list=ltx2_subfolders,
    local_files_only=local_files_only,
    torch_dtype=dtype,
)

vae_spatial_compression_ratio instance-attribute

vae_spatial_compression_ratio = (
    spatial_compression_ratio if vae is not None else 32
)

vae_temporal_compression_ratio instance-attribute

vae_temporal_compression_ratio = (
    temporal_compression_ratio if vae is not None else 8
)

video_processor instance-attribute

video_processor = VideoProcessor(
    vae_scale_factor=vae_spatial_compression_ratio
)

vocoder instance-attribute

vocoder = from_pretrained(
    model,
    subfolder="vocoder",
    torch_dtype=dtype,
    local_files_only=local_files_only,
)

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder="transformer",
        revision=None,
        prefix="transformer.",
        fall_back_to_pt=True,
    )
]

check_inputs

check_inputs(
    prompt,
    height,
    width,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    prompt_attention_mask=None,
    negative_prompt_attention_mask=None,
)

combine_cfg_noise

combine_cfg_noise(
    positive_noise_pred,
    negative_noise_pred,
    true_cfg_scale,
    cfg_normalize=False,
    *,
    video_latents: Tensor | None = None,
    audio_latents: Tensor | None = None,
    video_sigma: Tensor | None = None,
    audio_sigma: Tensor | None = None,
)

encode_prompt

encode_prompt(
    prompt: str | list[str],
    negative_prompt: str | list[str] | None = None,
    do_classifier_free_guidance: bool = True,
    num_videos_per_prompt: int = 1,
    prompt_embeds: Tensor | None = None,
    negative_prompt_embeds: Tensor | None = None,
    prompt_attention_mask: Tensor | None = None,
    negative_prompt_attention_mask: Tensor | None = None,
    max_sequence_length: int = 1024,
    device: device | None = None,
    dtype: dtype | None = None,
)

forward

forward(
    req: OmniDiffusionRequest,
    prompt: str | list[str] | None = None,
    negative_prompt: str | list[str] | None = None,
    height: int | None = None,
    width: int | None = None,
    num_frames: int | None = None,
    frame_rate: float | None = None,
    num_inference_steps: int | None = None,
    sigmas: list[float] | None = None,
    timesteps: list[int] | None = None,
    guidance_scale: float = 4.0,
    noise_scale: float = 0.0,
    num_videos_per_prompt: int | None = 1,
    generator: Generator | list[Generator] | None = None,
    latents: Tensor | None = None,
    audio_latents: Tensor | None = None,
    prompt_embeds: Tensor | None = None,
    negative_prompt_embeds: Tensor | None = None,
    prompt_attention_mask: Tensor | None = None,
    negative_prompt_attention_mask: Tensor | None = None,
    decode_timestep: float | list[float] = 0.0,
    decode_noise_scale: float | list[float] | None = None,
    output_type: str = "np",
    return_dict: bool = True,
    attention_kwargs: dict[str, Any] | None = None,
    max_sequence_length: int | None = None,
) -> DiffusionOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

predict_noise

predict_noise(**kwargs)

predict_noise_with_parallel_cfg

predict_noise_with_parallel_cfg(
    true_cfg_scale: float,
    positive_kwargs: dict[str, Any],
    negative_kwargs: dict[str, Any],
    cfg_normalize: bool = True,
    output_slice: int | None = None,
    *,
    video_latents: Tensor | None = None,
    audio_latents: Tensor | None = None,
    video_sigma: Tensor | None = None,
    audio_sigma: Tensor | None = None,
) -> tuple[Tensor, Tensor]

prepare_audio_latents

prepare_audio_latents(
    batch_size: int = 1,
    num_channels_latents: int = 8,
    audio_latent_length: int = 1,
    num_mel_bins: int = 64,
    noise_scale: float = 0.0,
    dtype: dtype | None = None,
    device: device | None = None,
    generator: Generator | list[Generator] | None = None,
    latents: Tensor | None = None,
) -> tuple[Tensor, int, int]

prepare_latents

prepare_latents(
    batch_size: int = 1,
    num_channels_latents: int = 128,
    height: int = 512,
    width: int = 768,
    num_frames: int = 121,
    noise_scale: float = 0.0,
    dtype: dtype | None = None,
    device: device | None = None,
    generator: Generator | None = None,
    latents: Tensor | None = None,
) -> Tensor

get_ltx2_post_process_func

get_ltx2_post_process_func(od_config: OmniDiffusionConfig)

Factory for the LTX-2.3 post-process function.

Detects the vocoder output sample rate at factory time and captures it in the closure so that the audio_sample_rate flows through DiffusionEngine -> OmniRequestOutput -> serving_video.