vllm_omni.diffusion.models.audiox.audiox_transformer ¶
AudioXCrossAttention ¶
Bases: Module
attn instance-attribute ¶
attn = Attention(
num_heads=local_nheads,
head_size=head_dim,
softmax_scale=head_dim**-0.5,
causal=False,
)
to_kv instance-attribute ¶
to_kv = MergedColumnParallelLinear(
input_size=dim,
output_sizes=[dim, dim],
bias=False,
gather_output=False,
prefix=f"{prefix}.to_kv",
)
to_q instance-attribute ¶
AudioXMMConvFeedForward ¶
Bases: Module
w1 instance-attribute ¶
w1 = _ColumnParallelChannelLastConv1d(
dim,
hidden_dim,
bias=False,
kernel_size=kernel_size,
padding=padding,
)
w2 instance-attribute ¶
w2 = _RowParallelChannelLastConv1d(
hidden_dim,
dim,
bias=False,
kernel_size=kernel_size,
padding=padding,
)
w3 instance-attribute ¶
w3 = _ColumnParallelChannelLastConv1d(
dim,
hidden_dim,
bias=False,
kernel_size=kernel_size,
padding=padding,
)
AudioXMMDiTBlock ¶
Bases: Module
adaLN_modulation instance-attribute ¶
cross_attn instance-attribute ¶
cross_attn = AudioXCrossAttention(
dim, nhead, prefix=f"{prefix}.cross_attn"
)
ffn instance-attribute ¶
ffn = AudioXMMConvFeedForward(
dim, int(dim * mlp_ratio), kernel_size=3, padding=1
)
linear1 instance-attribute ¶
linear1 = AudioXMMChannelLastConv1d(
dim, dim, kernel_size=3, padding=1
)
forward ¶
forward(
x: Tensor,
cond: Tensor,
rot: tuple[Tensor, Tensor] | None,
context: Tensor = None,
) -> Tensor
AudioXMMDiTSelfAttention ¶
AudioXRMSNorm ¶
ContinuousMMDiTTransformer ¶
Bases: Module
layers instance-attribute ¶
layers = ModuleList(
[
(
AudioXMMDiTBlock(
hidden_dim,
num_heads,
mlp_ratio=mlp_ratio,
prefix=f"layers.{i}",
)
)
for i in (range(depth))
]
)
proj_mm_seq_len instance-attribute ¶
proj_mm_tokens instance-attribute ¶
project_in instance-attribute ¶
project_out instance-attribute ¶
MMDiffusionTransformer ¶
Bases: Module
AudioX MMDiT, specialized for the published bundle (zhangj1an/AudioX).
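The published bundle pins patch_size=1, transformer_type="continuous_transformer", and cond_token_dim=768 (i.e. cond_token_dim > 0, with project_cond_tokens=False), and it never sets prepend_cond_dim or input_concat_dim. The code paths that handle any other values of these settings are therefore removed from this implementation.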
postprocess_conv instance-attribute ¶
preprocess_conv instance-attribute ¶
timestep_features instance-attribute ¶
timestep_features = GaussianFourierProjection(
in_features=1,
embedding_size=timestep_features_dim // 2,
scale=1.0,
trainable=False,
)
to_cond_embed instance-attribute ¶
to_cond_embed = Sequential(
Linear(cond_token_dim, cond_embed_dim, bias=False),
SiLU(),
Linear(cond_embed_dim, cond_embed_dim, bias=False),
)
to_global_embed instance-attribute ¶
to_global_embed = Sequential(
Linear(global_cond_dim, global_embed_dim, bias=False),
SiLU(),
Linear(global_embed_dim, global_embed_dim, bias=False),
)
to_timestep_embed instance-attribute ¶
to_timestep_embed = Sequential(
Linear(timestep_features_dim, embed_dim, bias=True),
SiLU(),
Linear(embed_dim, embed_dim, bias=True),
)
transformer instance-attribute ¶
transformer = ContinuousMMDiTTransformer(
dim=embed_dim,
depth=depth,
dim_heads=embed_dim // num_heads,
dim_in=io_channels,
dim_out=io_channels,
)