vllm_omni.diffusion.models.dreamid_omni.pipeline_dreamid_omni ¶

AUDIO_CONFIG `module-attribute` ¶

AUDIO_CONFIG = {
    "patch_size": [1],
    "model_type": "t2a",
    "dim": 3072,
    "ffn_dim": 14336,
    "freq_dim": 256,
    "num_heads": 24,
    "num_layers": 30,
    "in_dim": 20,
    "out_dim": 20,
    "text_len": 512,
    "window_size": [-1, -1],
    "qk_norm": True,
    "cross_attn_norm": True,
    "eps": 1e-06,
    "temporal_rope_scaling_factor": 0.19676,
}

VIDEO_CONFIG `module-attribute` ¶

VIDEO_CONFIG = {
    "patch_size": [1, 2, 2],
    "model_type": "ti2v",
    "dim": 3072,
    "ffn_dim": 14336,
    "freq_dim": 256,
    "num_heads": 24,
    "num_layers": 30,
    "in_dim": 48,
    "out_dim": 48,
    "text_len": 512,
    "window_size": [-1, -1],
    "qk_norm": True,
    "cross_attn_norm": True,
    "eps": 1e-06,
}

logger `module-attribute` ¶

logger = logging.getLogger(__name__)

DreamIDOmniPipeline ¶

Bases: Module, CFGParallelMixin, SupportImageInput, SupportAudioInput, SupportsComponentDiscovery

DreamID-Omni pipeline for vLLM-Omni.

audio_cfg_scale `instance-attribute` ¶

audio_cfg_scale = 4.0

audio_latent_channel `instance-attribute` ¶

audio_latent_channel = AUDIO_CONFIG.get('in_dim')

audio_latent_length `instance-attribute` ¶

audio_latent_length = 157

audio_ref_cfg_scale `instance-attribute` ¶

audio_ref_cfg_scale = 2.0

device `instance-attribute` ¶

device = get_local_device()

model `instance-attribute` ¶

model = FusionModel(
    VIDEO_CONFIG, AUDIO_CONFIG, quant_config=quant_config
)

od_config `instance-attribute` ¶

od_config = od_config

parallel_config `instance-attribute` ¶

parallel_config = od_config.parallel_config

scheduler_audio `instance-attribute` ¶

scheduler_audio = None

scheduler_video `instance-attribute` ¶

scheduler_video = None

target_area `instance-attribute` ¶

target_area = 960 * 960

target_dtype `instance-attribute` ¶

target_dtype = od_config.dtype

text_encoder `instance-attribute` ¶

text_encoder = self.text_model.model

text_model `instance-attribute` ¶

text_model = init_text_model(model, rank=self.device)

transformer `instance-attribute` ¶

transformer = self.model

vae `instance-attribute` ¶

vae = self.vae_model_video

vae_model_audio `instance-attribute` ¶

vae_model_audio = vae_model_audio.bfloat16()

vae_model_video `instance-attribute` ¶

vae_model_video = vae_model_video

video_cfg_scale `instance-attribute` ¶

video_cfg_scale = 3.0

video_latent_channel `instance-attribute` ¶

video_latent_channel = VIDEO_CONFIG.get('in_dim')

video_latent_length `instance-attribute` ¶

video_latent_length = 31

video_ref_cfg_scale `instance-attribute` ¶

video_ref_cfg_scale = 1.5

weights_sources `instance-attribute` ¶

weights_sources = [
    DiffusersPipelineLoader.ComponentSource(
        model_or_path=model,
        subfolder=fusion_subfolder,
        revision=None,
        prefix="model.",
        allow_patterns_overrides=[fusion_filename],
    )
]

combine_multi_branch_cfg_noise ¶

combine_multi_branch_cfg_noise(
    predictions, true_cfg_scale, cfg_normalize=False
)

diffuse ¶

diffuse(
    video_noise: Tensor,
    audio_noise: Tensor,
    latents_ref_image: Tensor,
    latents_ref_audio: Tensor,
    timesteps_video: Tensor,
    timesteps_audio: Tensor,
    text_embeddings_video_pos: Tensor,
    text_embeddings_video_neg: Tensor,
    text_embeddings_audio_pos: Tensor,
    text_embeddings_audio_neg: Tensor,
    max_seq_len_video: int,
    max_seq_len_audio: int,
    freqs_scaling_tensor: Tensor,
    ref_ip_num: int,
    ref_audio_length: int,
    ref_audio_lengths: list,
    scheduler_video,
    scheduler_audio,
) -> tuple[Tensor, Tensor]

Diffusion loop with CFG parallel support for DreamID-Omni.

encode_prompt ¶

encode_prompt(
    prompt: str,
    video_negative_prompt: str = "",
    audio_negative_prompt: str = "",
) -> tuple[Tensor, Tensor, Tensor, Tensor]

Encode the positive and negative prompts via `self.text_model`.

This is the single text-encoder entry point, so the runner-level prompt-embedding cache (see `vllm_omni/diffusion/cache/prompt_embed_cache.py`) can transparently memoize results when the same prompts are submitted repeatedly (e.g. GRPO rollouts that sample the same prompt with different seeds).

Returns:

Type	Description
`tuple[Tensor, Tensor, Tensor, Tensor]`	(audio_pos, video_pos, video_neg, audio_neg) embeddings cast to `self.target_dtype`. `audio_pos` and `video_pos` are the same tensor, matching the original inline behavior.

forward ¶

forward(
    request: DiffusionRequestBatch, **kwargs
) -> DiffusionOutput

Main forward pass of the DreamID-Omni pipeline for the R2AV task.

get_scheduler_time_steps ¶

get_scheduler_time_steps(
    sampling_steps, solver_name="unipc", device=0, shift=5.0
)

load_image_latent_ref_ip_video ¶

load_image_latent_ref_ip_video(
    images, audios, video_frame_height_width
)

load_weights ¶

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

predict_noise ¶

predict_noise(**kwargs)

get_dreamid_omni_post_process_func ¶

get_dreamid_omni_post_process_func(*args, **kwargs)

vllm_omni.diffusion.models.dreamid_omni.pipeline_dreamid_omni ¶

AUDIO_CONFIG module-attribute ¶

VIDEO_CONFIG module-attribute ¶

logger module-attribute ¶

DreamIDOmniPipeline ¶

audio_cfg_scale instance-attribute ¶

audio_latent_channel instance-attribute ¶

audio_latent_length instance-attribute ¶

audio_ref_cfg_scale instance-attribute ¶

device instance-attribute ¶

model instance-attribute ¶

od_config instance-attribute ¶

parallel_config instance-attribute ¶

scheduler_audio instance-attribute ¶

scheduler_video instance-attribute ¶

target_area instance-attribute ¶

target_dtype instance-attribute ¶

text_encoder instance-attribute ¶

text_model instance-attribute ¶

transformer instance-attribute ¶

vae instance-attribute ¶

vae_model_audio instance-attribute ¶

vae_model_video instance-attribute ¶

video_cfg_scale instance-attribute ¶

video_latent_channel instance-attribute ¶

video_latent_length instance-attribute ¶

video_ref_cfg_scale instance-attribute ¶

weights_sources instance-attribute ¶