Skip to content

vllm_omni.diffusion.models.audiox

Modules:

Name Description
audiox_transformer
pipeline_audiox

AudioXPipeline

Bases: Module, SupportAudioOutput, DiffusionPipelineProfilerMixin

audio_channels class-attribute

audio_channels: int = 2

audio_sample_rate class-attribute

audio_sample_rate: int = 44100

audio_vae_adapter instance-attribute

audio_vae_adapter = AudioVaePromptAdapter(
    cond_dim=int(model_config["conditioning"]["cond_dim"]),
    latent_seq_len=int(
        cond_configs["audio_prompt"]["latent_seq_len"]
    ),
)

clip_empty_visual_feat instance-attribute

clip_empty_visual_feat = Parameter(
    zeros(1, _clip_out_features, _DIM), requires_grad=False
)

clip_encoder instance-attribute

clip_encoder = CLIPVisionModelWithProjection(vision_config)

clip_proj instance-attribute

clip_proj = Linear(_in_features, _clip_out_features)

clip_proj_sync instance-attribute

clip_proj_sync = Linear(240, _clip_out_features)

clip_sync_weight instance-attribute

clip_sync_weight = Parameter(tensor(0.0))

clip_temp_pos_embedding instance-attribute

clip_temp_pos_embedding = Parameter(
    randn(1, _VIDEO_FPS * _DURATION_SEC, _DIM)
)

clip_temp_transformer instance-attribute

clip_temp_transformer = SA_Transformer(
    _DIM, depth=4, heads=16, dim_head=64, mlp_dim=_DIM * 4
)

device instance-attribute

device = get_local_device()

diffusion_objective instance-attribute

diffusion_objective = 'v'

io_channels instance-attribute

io_channels = model_config['io_channels']

maf_block instance-attribute

maf_block = MAF_Block(
    dim=768,
    num_experts_per_modality=int(
        gate_type_config["num_experts_per_modality"]
    ),
    num_heads=int(gate_type_config["num_heads"]),
    num_fusion_layers=int(
        gate_type_config["num_fusion_layers"]
    ),
)

model instance-attribute

model = MMDiffusionTransformer(
    **(dict(diffusion_config["config"]))
)

od_config instance-attribute

od_config = od_config

pretransform instance-attribute

pretransform = _build_audiox_oobleck(
    scaling_factor=float(get("scale", 1.0))
)

support_audio_output class-attribute

support_audio_output: bool = True

text_encoder instance-attribute

text_encoder = to(float16)

tokenizer instance-attribute

tokenizer = from_pretrained(t5_name)

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=_model_root,
        subfolder="transformer",
        revision=getattr(od_config, "revision", None),
        prefix="",
    )
]

diffuse

diffuse(
    *,
    steps: int,
    guidance_scale: float,
    conditioning_tensors: dict[str, Any],
    negative_conditioning_tensors: dict[str, Any] | None,
    batch_size: int,
    sigma_min: float,
    sigma_max: float,
    generator: Generator,
    cfg_rescale: float,
) -> Tensor

forward

get_conditioning_inputs

get_conditioning_inputs(
    conditioning_tensors: dict[str, Any],
    negative: bool = False,
) -> dict[str, Any]

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

MMDiffusionTransformer

Bases: Module

AudioX MMDiT, specialized for the published bundle (zhangj1an/AudioX).

The bundle pins patch_size=1, transformer_type="continuous_transformer", and cond_token_dim=768 (>0, with project_cond_tokens=False), and it never sets prepend_cond_dim or input_concat_dim, so the code paths that handled other settings of these options are removed.

cond_token_dim instance-attribute

cond_token_dim = cond_token_dim

postprocess_conv instance-attribute

postprocess_conv = Conv1d(
    io_channels, io_channels, 1, bias=False
)

preprocess_conv instance-attribute

preprocess_conv = Conv1d(
    io_channels, io_channels, 1, bias=False
)

timestep_features instance-attribute

timestep_features = GaussianFourierProjection(
    in_features=1,
    embedding_size=timestep_features_dim // 2,
    scale=1.0,
    trainable=False,
)

to_cond_embed instance-attribute

to_cond_embed = Sequential(
    Linear(cond_token_dim, cond_embed_dim, bias=False),
    SiLU(),
    Linear(cond_embed_dim, cond_embed_dim, bias=False),
)

to_global_embed instance-attribute

to_global_embed = Sequential(
    Linear(global_cond_dim, global_embed_dim, bias=False),
    SiLU(),
    Linear(global_embed_dim, global_embed_dim, bias=False),
)

to_timestep_embed instance-attribute

to_timestep_embed = Sequential(
    Linear(timestep_features_dim, embed_dim, bias=True),
    SiLU(),
    Linear(embed_dim, embed_dim, bias=True),
)

transformer instance-attribute

transformer = ContinuousMMDiTTransformer(
    dim=embed_dim,
    depth=depth,
    dim_heads=embed_dim // num_heads,
    dim_in=io_channels,
    dim_out=io_channels,
)

forward

forward(
    x,
    t,
    cross_attn_cond,
    negative_cross_attn_cond=None,
    negative_cross_attn_mask=None,
    cfg_scale: float = 1.0,
    scale_phi: float = 0.0,
    **kwargs,
)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

get_audiox_post_process_func

get_audiox_post_process_func(
    od_config: OmniDiffusionConfig,
)

Convert the pipeline's float audio tensor to a CPU NumPy array for serving.