vllm_omni.diffusion.models.audiox.pipeline_audiox ¶
AudioXPipeline ¶
Bases: Module, SupportAudioOutput, DiffusionPipelineProfilerMixin
audio_vae_adapter instance-attribute ¶
audio_vae_adapter = AudioVaePromptAdapter(
    cond_dim=int(model_config["conditioning"]["cond_dim"]),
    latent_seq_len=int(
        cond_configs["audio_prompt"]["latent_seq_len"]
    ),
)
clip_empty_visual_feat instance-attribute ¶
clip_temp_pos_embedding instance-attribute ¶
clip_temp_transformer instance-attribute ¶
clip_temp_transformer = SA_Transformer(
    _DIM, depth=4, heads=16, dim_head=64, mlp_dim=_DIM * 4
)
maf_block instance-attribute ¶
maf_block = MAF_Block(
    dim=768,
    num_experts_per_modality=int(
        gate_type_config["num_experts_per_modality"]
    ),
    num_heads=int(gate_type_config["num_heads"]),
    num_fusion_layers=int(
        gate_type_config["num_fusion_layers"]
    ),
)
pretransform instance-attribute ¶
pretransform = _build_audiox_oobleck(
    scaling_factor=float(get("scale", 1.0))
)
weights_sources instance-attribute ¶
weights_sources = [
    ComponentSource(
        model_or_path=_model_root,
        subfolder="transformer",
        revision=getattr(od_config, "revision", None),
        prefix="",
    )
]
diffuse ¶
diffuse(
    *,
    steps: int,
    guidance_scale: float,
    conditioning_tensors: dict[str, Any],
    negative_conditioning_tensors: dict[str, Any] | None,
    batch_size: int,
    sigma_min: float,
    sigma_max: float,
    generator: Generator,
    cfg_rescale: float,
) -> Tensor
MAF_Block ¶
SA_FeedForward ¶
SA_PreNorm ¶
SA_Transformer ¶
get_audiox_post_process_func ¶
get_audiox_post_process_func(
    od_config: OmniDiffusionConfig,
)
Convert the pipeline's float audio tensor to a CPU numpy array for serving.