Skip to content

vllm_omni.diffusion.models.dreamid_omni.pipeline_dreamid_omni

AUDIO_CONFIG module-attribute

AUDIO_CONFIG = {
    "patch_size": [1],
    "model_type": "t2a",
    "dim": 3072,
    "ffn_dim": 14336,
    "freq_dim": 256,
    "num_heads": 24,
    "num_layers": 30,
    "in_dim": 20,
    "out_dim": 20,
    "text_len": 512,
    "window_size": [-1, -1],
    "qk_norm": True,
    "cross_attn_norm": True,
    "eps": 1e-06,
    "temporal_rope_scaling_factor": 0.19676,
}

VIDEO_CONFIG module-attribute

VIDEO_CONFIG = {
    "patch_size": [1, 2, 2],
    "model_type": "ti2v",
    "dim": 3072,
    "ffn_dim": 14336,
    "freq_dim": 256,
    "num_heads": 24,
    "num_layers": 30,
    "in_dim": 48,
    "out_dim": 48,
    "text_len": 512,
    "window_size": [-1, -1],
    "qk_norm": True,
    "cross_attn_norm": True,
    "eps": 1e-06,
}

logger module-attribute

logger = getLogger(__name__)

DreamIDOmniPipeline

Bases: Module, CFGParallelMixin, SupportImageInput, SupportAudioInput

DreamID-Omni pipeline for vLLM-Omni.

audio_cfg_scale instance-attribute

audio_cfg_scale = 4.0

audio_latent_channel instance-attribute

audio_latent_channel = get('in_dim')

audio_latent_length instance-attribute

audio_latent_length = 157

audio_ref_cfg_scale instance-attribute

audio_ref_cfg_scale = 2.0

device instance-attribute

device = get_local_device()

model instance-attribute

model = FusionModel(
    VIDEO_CONFIG, AUDIO_CONFIG, quant_config=quant_config
)

od_config instance-attribute

od_config = od_config

parallel_config instance-attribute

parallel_config = parallel_config

scheduler_audio instance-attribute

scheduler_audio = None

scheduler_video instance-attribute

scheduler_video = None

target_area instance-attribute

target_area = 960 * 960

target_dtype instance-attribute

target_dtype = dtype

text_encoder instance-attribute

text_encoder = model

text_model instance-attribute

text_model = init_text_model(model, rank=device)

transformer instance-attribute

transformer = model

vae instance-attribute

vae = vae_model_video

vae_model_audio instance-attribute

vae_model_audio = bfloat16()

vae_model_video instance-attribute

vae_model_video = vae_model_video

video_cfg_scale instance-attribute

video_cfg_scale = 3.0

video_latent_channel instance-attribute

video_latent_channel = get('in_dim')

video_latent_length instance-attribute

video_latent_length = 31

video_ref_cfg_scale instance-attribute

video_ref_cfg_scale = 1.5

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder=fusion_subfolder,
        revision=None,
        prefix="model.",
        allow_patterns_overrides=[fusion_filename],
    )
]

combine_multi_branch_cfg_noise

combine_multi_branch_cfg_noise(
    predictions, true_cfg_scale, cfg_normalize=False
)

diffuse

diffuse(
    video_noise: Tensor,
    audio_noise: Tensor,
    latents_ref_image: Tensor,
    latents_ref_audio: Tensor,
    timesteps_video: Tensor,
    timesteps_audio: Tensor,
    text_embeddings_video_pos: Tensor,
    text_embeddings_video_neg: Tensor,
    text_embeddings_audio_pos: Tensor,
    text_embeddings_audio_neg: Tensor,
    max_seq_len_video: int,
    max_seq_len_audio: int,
    freqs_scaling_tensor: Tensor,
    ref_ip_num: int,
    ref_audio_length: int,
    ref_audio_lengths: list,
    scheduler_video,
    scheduler_audio,
) -> tuple[Tensor, Tensor]

Diffusion loop with CFG parallel support for DreamID-Omni.

encode_prompt

encode_prompt(
    prompt: str,
    video_negative_prompt: str = "",
    audio_negative_prompt: str = "",
) -> tuple[Tensor, Tensor, Tensor, Tensor]

Encode the positive and negative prompts via self.text_model.

This is the single text-encoder entrypoint so that the runner-level prompt-embedding cache (see vllm_omni/diffusion/cache/prompt_embed_cache.py) can transparently memoize results when the same prompts are submitted repeatedly (e.g. GRPO rollouts that sample the same prompt with different seeds).

Returns:

Type Description
tuple[Tensor, Tensor, Tensor, Tensor]

(audio_pos, video_pos, video_neg, audio_neg) embeddings cast to self.target_dtype. audio_pos and video_pos are the same tensor, matching the original inline behavior.

forward

forward(
    request: OmniDiffusionRequest, **kwargs
) -> DiffusionOutput

Main forward pass of the DreamID-Omni pipeline for the R2AV task.

get_scheduler_time_steps

get_scheduler_time_steps(
    sampling_steps, solver_name="unipc", device=0, shift=5.0
)

load_image_latent_ref_ip_video

load_image_latent_ref_ip_video(
    images, audios, video_frame_height_width
)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

predict_noise

predict_noise(**kwargs)

get_dreamid_omni_post_process_func

get_dreamid_omni_post_process_func(*args, **kwargs)