Skip to content

vllm_omni.diffusion.models.magi_human.pipeline_magi_human

logger module-attribute

logger = getLogger(__name__)

EvalInput dataclass

audio_feat_len instance-attribute

audio_feat_len: Tensor | list[int]

audio_x_t instance-attribute

audio_x_t: Tensor

txt_feat instance-attribute

txt_feat: Tensor

txt_feat_len instance-attribute

txt_feat_len: Tensor | list[int]

x_t instance-attribute

x_t: Tensor

FlowUniPCMultistepScheduler

Bases: SchedulerMixin, ConfigMixin

begin_index property

begin_index

disable_corrector instance-attribute

disable_corrector = disable_corrector

last_sample instance-attribute

last_sample = None

lower_order_nums instance-attribute

lower_order_nums = 0

model_outputs instance-attribute

model_outputs = [None] * solver_order

num_inference_steps instance-attribute

num_inference_steps = None

order class-attribute instance-attribute

order = 1

predict_x0 instance-attribute

predict_x0 = predict_x0

sigma_max instance-attribute

sigma_max = item()

sigma_min instance-attribute

sigma_min = item()

sigmas instance-attribute

sigmas = to('cpu')

solver_p instance-attribute

solver_p = solver_p

step_index property

step_index

timestep_list instance-attribute

timestep_list = [None] * solver_order

timesteps instance-attribute

timesteps = sigmas * num_train_timesteps

add_noise

add_noise(
    original_samples: Tensor,
    noise: Tensor,
    timesteps: IntTensor,
) -> Tensor

convert_model_output

convert_model_output(
    model_output: Tensor,
    *args,
    sample: Tensor = None,
    **kwargs,
) -> Tensor

index_for_timestep

index_for_timestep(timestep, schedule_timesteps=None)

multistep_uni_c_bh_update

multistep_uni_c_bh_update(
    this_model_output: Tensor,
    *args,
    last_sample: Tensor = None,
    this_sample: Tensor = None,
    order: int | None = None,
    **kwargs,
) -> Tensor

multistep_uni_p_bh_update

multistep_uni_p_bh_update(
    model_output: Tensor,
    *args,
    sample: Tensor | None = None,
    order: int | None = None,
    **kwargs,
) -> Tensor

scale_model_input

scale_model_input(
    sample: Tensor, *args, **kwargs
) -> Tensor

set_begin_index

set_begin_index(begin_index: int = 0)

set_timesteps

set_timesteps(
    num_inference_steps: int | None = None,
    device: str | device = None,
    sigmas: list[float] | None = None,
    mu: float | None | None = None,
    shift: float | None | None = None,
)

step

step(
    model_output: Tensor,
    timestep: int | Tensor,
    sample: Tensor,
    return_dict: bool = True,
    generator=None,
) -> SchedulerOutput | tuple

step_ddim

step_ddim(
    velocity: FloatTensor,
    t: int,
    curr_state: FloatTensor,
    prev_state: FloatTensor | None = None,
    generator: Generator | None = None,
)

step_sde

step_sde(
    velocity: FloatTensor,
    t: int,
    curr_state: FloatTensor,
    noise_theta: float = 1.0,
    prev_state: FloatTensor | None = None,
    generator: Generator | None = None,
)

time_shift

time_shift(mu: float, sigma: float, t: Tensor)

MagiDataProxy

coords_style instance-attribute

coords_style = coords_style

frame_receptive_field instance-attribute

frame_receptive_field = frame_receptive_field

patch_size instance-attribute

patch_size = patch_size

ref_audio_offset instance-attribute

ref_audio_offset = ref_audio_offset

spatial_rope_interpolation instance-attribute

spatial_rope_interpolation = spatial_rope_interpolation

t_patch_size instance-attribute

t_patch_size = t_patch_size

text_offset instance-attribute

text_offset = text_offset

get_saved_data

get_saved_data(key: str)

img2tokens

img2tokens(x_t: Tensor)

process_input

process_input(transported_data: EvalInput)

process_output

process_output(x: Tensor)

saved_for_output

saved_for_output(**kwargs)

MagiHumanPipeline

Bases: Module, ProgressBarMixin, DiffusionPipelineProfilerMixin

audio_txt_guidance_scale instance-attribute

audio_txt_guidance_scale = get(
    "audio_txt_guidance_scale", 5.0
)

audio_vae instance-attribute

audio_vae = SAAudioFeatureExtractor(
    device=device,
    model_path=_resolve_subdir(
        model_path,
        "audio_vae",
        local_files_only,
        required_files=[
            "config.json",
            "model_config.json",
            "model.safetensors",
        ],
    ),
)

cfg_number instance-attribute

cfg_number = get('cfg_number', 2)

cfg_trick_start_frame instance-attribute

cfg_trick_start_frame = get('cfg_trick_start_frame', 13)

cfg_trick_value instance-attribute

cfg_trick_value = get('cfg_trick_value', 2.0)

data_proxy instance-attribute

data_proxy = MagiDataProxy(
    patch_size=get("patch_size", 2),
    t_patch_size=get("t_patch_size", 1),
    frame_receptive_field=get("frame_receptive_field", 11),
    spatial_rope_interpolation=get(
        "spatial_rope_interpolation", "extra"
    ),
    ref_audio_offset=get("ref_audio_offset", 1000),
    text_offset=get("text_offset", 0),
    coords_style=get("coords_style", "v2"),
)

device_str instance-attribute

device_str = device

dit instance-attribute

dit = DiTModel(dit_model_config)

dtype instance-attribute

dtype = dtype or bfloat16

fps instance-attribute

fps = get('fps', 25)

noise_value instance-attribute

noise_value = get('noise_value', 220)

num_inference_steps_default instance-attribute

num_inference_steps_default = get("num_inference_steps", 32)

patch_size instance-attribute

patch_size = get('patch_size', [1, 2, 2])

shift instance-attribute

shift = get('shift', 5.0)

sr_audio_noise_scale instance-attribute

sr_audio_noise_scale = get('sr_audio_noise_scale', 0.7)

sr_cfg_number instance-attribute

sr_cfg_number = get('sr_cfg_number', 2)

sr_data_proxy instance-attribute

sr_data_proxy = MagiDataProxy(
    patch_size=get("patch_size", 2),
    t_patch_size=get("t_patch_size", 1),
    frame_receptive_field=get("frame_receptive_field", 11),
    spatial_rope_interpolation=get(
        "spatial_rope_interpolation", "extra"
    ),
    ref_audio_offset=get("ref_audio_offset", 1000),
    text_offset=get("text_offset", 0),
    coords_style="v1",
)

sr_dit instance-attribute

sr_dit = DiTModel(sr_dit_model_config)

sr_num_inference_steps_default instance-attribute

sr_num_inference_steps_default = get(
    "sr_num_inference_steps", 5
)

sr_video_txt_guidance_scale instance-attribute

sr_video_txt_guidance_scale = get(
    "sr_video_txt_guidance_scale", 3.5
)

t5_gemma_target_length instance-attribute

t5_gemma_target_length = get('t5_gemma_target_length', 640)

text_encoder instance-attribute

text_encoder = _T5GemmaEncoder(
    model_path=txt_enc_path,
    device=device,
    weight_dtype=dtype,
    subfolder=txt_enc_subfolder,
)

use_cfg_trick instance-attribute

use_cfg_trick = get('use_cfg_trick', True)

using_sde_flag instance-attribute

using_sde_flag = get('using_sde_flag', False)

vae instance-attribute

vae = from_pretrained(model_path, subfolder='vae')

vae_latent_mean instance-attribute

vae_latent_mean = tensor(
    vae_cfg["latents_mean"], dtype=float32
)

vae_latent_std instance-attribute

vae_latent_std = tensor(
    vae_cfg["latents_std"], dtype=float32
)

vae_stride instance-attribute

vae_stride = get('vae_stride', [4, 16, 16])

video_processor instance-attribute

video_processor = VideoProcessor(vae_scale_factor=16)

video_txt_guidance_scale instance-attribute

video_txt_guidance_scale = get(
    "video_txt_guidance_scale", 5.0
)

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model_path,
        subfolder=dit_subfolder,
        revision=None,
        prefix="dit.",
        fall_back_to_pt=True,
    ),
    ComponentSource(
        model_or_path=model_path,
        subfolder=sr_dit_subfolder,
        revision=None,
        prefix="sr_dit.",
        fall_back_to_pt=True,
    ),
]

z_dim instance-attribute

z_dim = get('z_dim', 48)

zerosnr_sigmas instance-attribute

zerosnr_sigmas = ZeroSNRDDPMDiscretization()(
    1000, do_append_zero=False, flip=True
)

encode_prompt

encode_prompt(
    prompt: str, target_length: int | None = None
) -> tuple[Tensor, int]

Encode prompt with the T5-Gemma text encoder and pad to fixed length.

This is the single text-encoder entrypoint so the runner-level prompt-embedding cache (see vllm_omni/diffusion/cache/prompt_embed_cache.py) can transparently memoize results when the same prompt is submitted repeatedly.

Returns:

Type Description
tuple[Tensor, int]

(context, original_context_len), matching `_get_padded_t5_gemma_embedding`.

forward

forward(
    req: OmniDiffusionRequest,
    prompt: str | None = None,
    height: int = 256,
    width: int = 448,
    num_inference_steps: int | None = None,
    seconds: int = 10,
    seed: int | None = None,
    image_path: str | None = None,
    audio_path: str | None = None,
    **kwargs,
) -> DiffusionOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

SAAudioFeatureExtractor

device instance-attribute

device = device

resampler instance-attribute

resampler = None

decode

decode(latents)

encode

encode(waveform)

ZeroSNRDDPMDiscretization

ZeroSNR DDPM sigma schedule, ported from daVinci-MagiHuman. Used to compute sigma values for SR noise injection.

alphas_cumprod instance-attribute

alphas_cumprod = cumprod(alphas, axis=0)

num_timesteps instance-attribute

num_timesteps = num_timesteps

post_shift instance-attribute

post_shift = post_shift

shift_scale instance-attribute

shift_scale = shift_scale

to_torch instance-attribute

to_torch = partial(tensor, dtype=float32)

get_magi_human_post_process_func

get_magi_human_post_process_func(*args, **kwargs)

get_magi_human_pre_process_func

get_magi_human_pre_process_func(*args, **kwargs)

load_audio_and_encode

load_audio_and_encode(
    audio_vae, audio_path: str, seconds: int | None = None
) -> Tensor

Load audio from file and encode to latent space using the Stable Audio VAE.