vllm_omni.diffusion.models.ltx2.pipeline_ltx2 ¶
LTX2Pipeline ¶
Bases: Module, CFGParallelMixin, ProgressBarMixin
audio_hop_length instance-attribute ¶
audio_hop_length = (
mel_hop_length
if getattr(self, "audio_vae", None) is not None
else 160
)
audio_sampling_rate instance-attribute ¶
audio_sampling_rate = (
sample_rate
if getattr(self, "audio_vae", None) is not None
else 16000
)
audio_vae_mel_compression_ratio instance-attribute ¶
audio_vae_mel_compression_ratio = (
mel_compression_ratio
if getattr(self, "audio_vae", None) is not None
else 4
)
audio_vae_temporal_compression_ratio instance-attribute ¶
audio_vae_temporal_compression_ratio = (
temporal_compression_ratio
if getattr(self, "audio_vae", None) is not None
else 4
)
scheduler instance-attribute ¶
tokenizer instance-attribute ¶
transformer instance-attribute ¶
transformer = create_transformer_from_config(
transformer_config, quant_config=quant_config
)
transformer_spatial_patch_size instance-attribute ¶
transformer_spatial_patch_size = (
patch_size
if getattr(self, "transformer", None) is not None
else 1
)
transformer_temporal_patch_size instance-attribute ¶
transformer_temporal_patch_size = (
patch_size_t
if getattr(self, "transformer", None) is not None
else 1
)
vae_spatial_compression_ratio instance-attribute ¶
vae_spatial_compression_ratio = (
spatial_compression_ratio
if getattr(self, "vae", None) is not None
else 32
)
vae_temporal_compression_ratio instance-attribute ¶
vae_temporal_compression_ratio = (
temporal_compression_ratio
if getattr(self, "vae", None) is not None
else 8
)
video_processor instance-attribute ¶
weights_sources instance-attribute ¶
weights_sources = [
ComponentSource(
model_or_path=model,
subfolder="transformer",
revision=None,
prefix="transformer.",
fall_back_to_pt=True,
)
]
check_inputs ¶
check_inputs(
prompt,
height,
width,
prompt_embeds=None,
negative_prompt_embeds=None,
prompt_attention_mask=None,
negative_prompt_attention_mask=None,
)
combine_cfg_noise ¶
Per-element CFG combine with guidance_rescale support.
encode_prompt ¶
encode_prompt(
prompt: str | list[str],
negative_prompt: str | list[str] | None = None,
do_classifier_free_guidance: bool = True,
num_videos_per_prompt: int = 1,
prompt_embeds: Tensor | None = None,
negative_prompt_embeds: Tensor | None = None,
prompt_attention_mask: Tensor | None = None,
negative_prompt_attention_mask: Tensor | None = None,
max_sequence_length: int = 1024,
scale_factor: int = 8,
device: device | None = None,
dtype: dtype | None = None,
)
forward ¶
forward(
req: OmniDiffusionRequest,
prompt: str | list[str] | None = None,
negative_prompt: str | list[str] | None = None,
height: int | None = None,
width: int | None = None,
num_frames: int | None = None,
frame_rate: float | None = None,
num_inference_steps: int | None = None,
sigmas: list[float] | None = None,
timesteps: list[int] | None = None,
guidance_scale: float = 4.0,
guidance_rescale: float = 0.0,
noise_scale: float = 0.0,
num_videos_per_prompt: int | None = 1,
generator: Generator | list[Generator] | None = None,
latents: Tensor | None = None,
audio_latents: Tensor | None = None,
prompt_embeds: Tensor | None = None,
negative_prompt_embeds: Tensor | None = None,
prompt_attention_mask: Tensor | None = None,
negative_prompt_attention_mask: Tensor | None = None,
decode_timestep: float | list[float] = 0.0,
decode_noise_scale: float | list[float] | None = None,
output_type: str = "np",
return_dict: bool = True,
attention_kwargs: dict[str, Any] | None = None,
max_sequence_length: int | None = None,
) -> DiffusionOutput
prepare_audio_latents ¶
prepare_audio_latents(
batch_size: int = 1,
num_channels_latents: int = 8,
audio_latent_length: int = 1,
num_mel_bins: int = 64,
noise_scale: float = 0.0,
dtype: dtype | None = None,
device: device | None = None,
generator: Generator | list[Generator] | None = None,
latents: Tensor | None = None,
) -> tuple[Tensor, int, int]
prepare_latents ¶
prepare_latents(
batch_size: int = 1,
num_channels_latents: int = 128,
height: int = 512,
width: int = 768,
num_frames: int = 121,
noise_scale: float = 0.0,
dtype: dtype | None = None,
device: device | None = None,
generator: Generator | None = None,
latents: Tensor | None = None,
) -> Tensor
LTX2T2VDMD2Pipeline ¶
LTX2TwoStagesPipeline ¶
Bases: Module, SupportsComponentDiscovery
LTX2TwoStagesPipeline is for two-stage image-to-video generation.
lora_manager instance-attribute ¶
lora_manager = DiffusionLoRAManager(
pipeline=pipe,
device=device,
dtype=dtype,
max_cached_adapters=max_cpu_loras,
)
upsample_pipe instance-attribute ¶
upsample_pipe = LTX2LatentUpsamplePipeline(
vae=vae, od_config=od_config
)
weights_sources instance-attribute ¶
weights_sources = [
ComponentSource(
model_or_path=model,
subfolder="transformer",
revision=None,
prefix="pipe.transformer.",
fall_back_to_pt=True,
)
]
forward ¶
forward(
req: OmniDiffusionRequest,
prompt: str | list[str] | None = None,
negative_prompt: str | list[str] | None = None,
height: int | None = None,
width: int | None = None,
num_frames: int | None = None,
frame_rate: float | None = None,
num_inference_steps: int | None = None,
timesteps: list[int] | None = None,
guidance_scale: float = 4.0,
guidance_rescale: float = 0.0,
noise_scale: float = 0.0,
num_videos_per_prompt: int | None = 1,
generator: Generator | list[Generator] | None = None,
latents: Tensor | None = None,
audio_latents: Tensor | None = None,
prompt_embeds: Tensor | None = None,
negative_prompt_embeds: Tensor | None = None,
prompt_attention_mask: Tensor | None = None,
negative_prompt_attention_mask: Tensor | None = None,
decode_timestep: float | list[float] = 0.0,
decode_noise_scale: float | list[float] | None = None,
output_type: str = "np",
return_dict: bool = True,
attention_kwargs: dict[str, Any] | None = None,
max_sequence_length: int | None = None,
)
calculate_shift ¶
calculate_shift(
image_seq_len,
base_seq_len: int = 256,
max_seq_len: int = 4096,
base_shift: float = 0.5,
max_shift: float = 1.15,
)
create_transformer_from_config ¶
create_transformer_from_config(
config: dict,
quant_config: QuantizationConfig | None = None,
) -> LTX2VideoTransformer3DModel
Create an LTX2VideoTransformer3DModel from a config dict.