vllm_omni.diffusion.models.audiox.pipeline_audiox ¶
prepare_audio_reference module-attribute ¶
prepare_audio_reference = (
_audiox_transforms.prepare_audio_reference
)
prepare_video_reference module-attribute ¶
prepare_video_reference = (
_audiox_transforms.prepare_video_reference
)
AudioXPipeline ¶
Bases: Module, SupportAudioOutput, DiffusionPipelineProfilerMixin
audio_vae_adapter instance-attribute ¶
audio_vae_adapter = AudioVaePromptAdapter(
cond_dim=int(model_config["conditioning"]["cond_dim"]),
latent_seq_len=int(
cond_configs["audio_prompt"]["latent_seq_len"]
),
)
clip_empty_visual_feat instance-attribute ¶
clip_empty_visual_feat = nn.Parameter(
torch.zeros(1, self._clip_out_features, _DIM),
requires_grad=False,
)
clip_encoder instance-attribute ¶
clip_temp_pos_embedding instance-attribute ¶
clip_temp_transformer instance-attribute ¶
clip_temp_transformer = SA_Transformer(
_DIM, depth=4, heads=16, dim_head=64, mlp_dim=_DIM * 4
)
maf_block instance-attribute ¶
maf_block = MAF_Block(
dim=768,
num_experts_per_modality=int(
gate_type_config["num_experts_per_modality"]
),
num_heads=int(gate_type_config["num_heads"]),
num_fusion_layers=int(
gate_type_config["num_fusion_layers"]
),
)
pretransform instance-attribute ¶
pretransform = _build_audiox_oobleck(
scaling_factor=float(
model_config["pretransform"].get("scale", 1.0)
)
)
text_encoder instance-attribute ¶
text_encoder = (
T5EncoderModel(t5_config)
.train(False)
.requires_grad_(False)
.to(torch.float16)
)
weights_sources instance-attribute ¶
weights_sources = [
DiffusersPipelineLoader.ComponentSource(
model_or_path=self._model_root,
subfolder="transformer",
revision=getattr(od_config, "revision", None),
prefix="",
)
]
diffuse ¶
diffuse(
*,
steps: int,
guidance_scale: float,
conditioning_tensors: dict[str, Any],
negative_conditioning_tensors: dict[str, Any] | None,
batch_size: int,
sigma_min: float,
sigma_max: float,
generator: Generator,
cfg_rescale: float,
) -> Tensor
MAF_Block ¶
Bases: Module
fusion_blocks instance-attribute ¶
fusion_blocks = nn.ModuleList(
[
(_MAFFusionBlock(dim, num_heads, mlp_ratio))
for _ in (range(num_fusion_layers))
]
)
gating_network instance-attribute ¶
gating_network = nn.Sequential(
nn.Linear(dim * 3, dim),
nn.GELU(),
nn.Linear(dim, 3),
nn.Sigmoid(),
)
unified_experts instance-attribute ¶
SA_Attention ¶
Bases: Module
to_out instance-attribute ¶
to_out = (
nn.Sequential(
nn.Linear(inner_dim, dim), nn.Dropout(0.0)
)
if project_out
else nn.Identity()
)
SA_FeedForward ¶
SA_PreNorm ¶
SA_Transformer ¶
get_audiox_post_process_func ¶
get_audiox_post_process_func(
od_config: OmniDiffusionConfig,
)
Convert the pipeline's float audio tensor to a CPU NumPy array for serving.