Skip to content

vllm_omni.diffusion.models.audiox.pipeline_audiox

logger module-attribute

logger = init_logger(__name__)

prepare_audio_reference module-attribute

prepare_audio_reference = prepare_audio_reference

prepare_video_reference module-attribute

prepare_video_reference = prepare_video_reference

AudioVaePromptAdapter

Bases: Module

pretransform instance-attribute

pretransform = _build_audiox_oobleck()

proj_features_128 instance-attribute

proj_features_128 = Linear(latent_seq_len, 128)

proj_out instance-attribute

proj_out = (
    Linear(in_ch, cond_dim)
    if in_ch != cond_dim
    else Identity()
)

forward

forward(audio: Tensor) -> tuple[Tensor, Tensor]

AudioXPipeline

Bases: Module, SupportAudioOutput, DiffusionPipelineProfilerMixin

audio_channels class-attribute

audio_channels: int = 2

audio_sample_rate class-attribute

audio_sample_rate: int = 44100

audio_vae_adapter instance-attribute

audio_vae_adapter = AudioVaePromptAdapter(
    cond_dim=int(model_config["conditioning"]["cond_dim"]),
    latent_seq_len=int(
        cond_configs["audio_prompt"]["latent_seq_len"]
    ),
)

clip_empty_visual_feat instance-attribute

clip_empty_visual_feat = Parameter(
    zeros(1, _clip_out_features, _DIM), requires_grad=False
)

clip_encoder instance-attribute

clip_encoder = CLIPVisionModelWithProjection(vision_config)

clip_proj instance-attribute

clip_proj = Linear(_in_features, _clip_out_features)

clip_proj_sync instance-attribute

clip_proj_sync = Linear(240, _clip_out_features)

clip_sync_weight instance-attribute

clip_sync_weight = Parameter(tensor(0.0))

clip_temp_pos_embedding instance-attribute

clip_temp_pos_embedding = Parameter(
    randn(1, _VIDEO_FPS * _DURATION_SEC, _DIM)
)

clip_temp_transformer instance-attribute

clip_temp_transformer = SA_Transformer(
    _DIM, depth=4, heads=16, dim_head=64, mlp_dim=_DIM * 4
)

device instance-attribute

device = get_local_device()

diffusion_objective instance-attribute

diffusion_objective = 'v'

io_channels instance-attribute

io_channels = model_config['io_channels']

maf_block instance-attribute

maf_block = MAF_Block(
    dim=768,
    num_experts_per_modality=int(
        gate_type_config["num_experts_per_modality"]
    ),
    num_heads=int(gate_type_config["num_heads"]),
    num_fusion_layers=int(
        gate_type_config["num_fusion_layers"]
    ),
)

model instance-attribute

model = MMDiffusionTransformer(
    **(dict(diffusion_config["config"]))
)

od_config instance-attribute

od_config = od_config

pretransform instance-attribute

pretransform = _build_audiox_oobleck(
    scaling_factor=float(get("scale", 1.0))
)

support_audio_output class-attribute

support_audio_output: bool = True

text_encoder instance-attribute

text_encoder = to(float16)

tokenizer instance-attribute

tokenizer = from_pretrained(t5_name)

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=_model_root,
        subfolder="transformer",
        revision=getattr(od_config, "revision", None),
        prefix="",
    )
]

diffuse

diffuse(
    *,
    steps: int,
    guidance_scale: float,
    conditioning_tensors: dict[str, Any],
    negative_conditioning_tensors: dict[str, Any] | None,
    batch_size: int,
    sigma_min: float,
    sigma_max: float,
    generator: Generator,
    cfg_rescale: float,
) -> Tensor

forward

get_conditioning_inputs

get_conditioning_inputs(
    conditioning_tensors: dict[str, Any],
    negative: bool = False,
) -> dict[str, Any]

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

MAF_Block

Bases: Module

DIM class-attribute instance-attribute

DIM = 768

MLP_RATIO class-attribute instance-attribute

MLP_RATIO = 4.0

bypass_gate_a instance-attribute

bypass_gate_a = Parameter(tensor(-10.0))

bypass_gate_t instance-attribute

bypass_gate_t = Parameter(tensor(-10.0))

bypass_gate_v instance-attribute

bypass_gate_v = Parameter(tensor(-10.0))

cross_block instance-attribute

cross_block = _MAFCrossAttentionBlock(dim, num_heads)

fusion_blocks instance-attribute

fusion_blocks = ModuleList(
    [
        (_MAFFusionBlock(dim, num_heads, mlp_ratio))
        for _ in (range(num_fusion_layers))
    ]
)

gating_network instance-attribute

gating_network = Sequential(
    Linear(dim * 3, dim), GELU(), Linear(dim, 3), Sigmoid()
)

norm1 instance-attribute

norm1 = LayerNorm(dim)

norm_a2 instance-attribute

norm_a2 = LayerNorm(dim)

norm_t2 instance-attribute

norm_t2 = LayerNorm(dim)

norm_v2 instance-attribute

norm_v2 = LayerNorm(dim)

unified_experts instance-attribute

unified_experts = Parameter(randn(total_experts, dim))

forward

forward(
    video_tokens: Tensor,
    text_tokens: Tensor,
    audio_tokens: Tensor,
) -> dict[str, Tensor]

SA_Attention

Bases: Module

heads instance-attribute

heads = heads

scale instance-attribute

scale = dim_head ** -0.5

to_out instance-attribute

to_out = (
    Sequential(Linear(inner_dim, dim), Dropout(0.0))
    if project_out
    else Identity()
)

to_qkv instance-attribute

to_qkv = Linear(dim, inner_dim * 3, bias=False)

forward

forward(x)

SA_FeedForward

Bases: Module

net instance-attribute

net = Sequential(
    Linear(dim, hidden_dim),
    GELU(),
    Dropout(0.0),
    Linear(hidden_dim, dim),
    Dropout(0.0),
)

forward

forward(x)

SA_PreNorm

Bases: Module

fn instance-attribute

fn = fn

norm instance-attribute

norm = LayerNorm(dim)

forward

forward(x, **kwargs)

SA_Transformer

Bases: Module

layers instance-attribute

layers = ModuleList([])

norm instance-attribute

norm = LayerNorm(dim)

forward

forward(x)

get_audiox_post_process_func

get_audiox_post_process_func(
    od_config: OmniDiffusionConfig,
)

Return a post-processing function that converts the pipeline's float audio tensor to a CPU NumPy array for serving.