Skip to content

vllm_omni.diffusion.models.cosmos3.audio_tokenizer.avae

Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation.

logger module-attribute

logger = init_logger(__name__)

Cosmos3AVAEAudioTokenizer

Bases: Module

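Decoder-only AVAE tokenizer for Cosmos3 audio latents. The only network submodule listed on this page is `decoder`, although an `encode` method is exposed alongside `decode`.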

audio_channels instance-attribute

audio_channels = int(
    _config_get(
        config,
        "dec_out_channels",
        "audio_channels",
        default=2
        if bool(get("stereo", audio_channels == 2))
        else 1,
    )
)

decoder instance-attribute

decoder = OobleckDecoder(
    channels=int(
        _config_get(config, "dec_dim", default=320)
    ),
    input_channels=latent_ch,
    audio_channels=audio_channels,
    upsampling_ratios=list(reversed(dec_strides)),
    channel_multiples=list(
        _config_get(
            config, "dec_c_mults", default=[1, 2, 4, 8, 16]
        )
    ),
)

device instance-attribute

device = device(device)

dtype instance-attribute

dtype = dtype

hop_size instance-attribute

hop_size = int(
    _config_get(
        config,
        "hop_size",
        default=prod(dec_strides)
        if dec_strides
        else hop_size,
    )
)

latent_ch instance-attribute

latent_ch = int(
    _config_get(
        config,
        "vocoder_input_dim",
        "io_channels",
        "latent_ch",
        default=io_channels,
    )
)

normalization_type instance-attribute

normalization_type = normalization_type

sample_rate instance-attribute

sample_rate = int(
    _config_get(
        config,
        "sampling_rate",
        "sample_rate",
        default=sample_rate,
    )
)

tanh_clamp instance-attribute

tanh_clamp = float(
    _config_get(config, "tanh_clamp", default=tanh_clamp)
)

tanh_input_scale instance-attribute

tanh_input_scale = float(
    _config_get(
        config, "tanh_input_scale", default=tanh_input_scale
    )
)

tanh_output_scale instance-attribute

tanh_output_scale = float(
    _config_get(
        config,
        "tanh_output_scale",
        default=tanh_output_scale,
    )
)

temporal_compression_factor property

temporal_compression_factor: int

decode

decode(latent: Tensor) -> Tensor

encode

encode(audio: Tensor, force_pad: bool = False) -> Tensor

get_audio_num_samples

get_audio_num_samples(num_latent_samples: int) -> int

get_latent_num_samples

get_latent_num_samples(num_audio_samples: int) -> int

OobleckDecoder

Bases: Module

Diffusers-compatible Oobleck decoder for Cosmos3 AVAE latents.

block instance-attribute

block = ModuleList(block)

conv1 instance-attribute

conv1 = weight_norm(
    Conv1d(
        input_channels,
        channels * channel_multiples[-1],
        kernel_size=7,
        padding=3,
    )
)

conv2 instance-attribute

conv2 = weight_norm(
    Conv1d(
        channels,
        audio_channels,
        kernel_size=7,
        padding=3,
        bias=False,
    )
)

snake1 instance-attribute

snake1 = Snake1d(channels)

forward

forward(hidden_state: Tensor) -> Tensor

OobleckDecoderBlock

Bases: Module

Decoder block used by the diffusers Oobleck decoder.

conv_t1 instance-attribute

conv_t1 = weight_norm(
    ConvTranspose1d(
        input_dim,
        output_dim,
        kernel_size=2 * stride,
        stride=stride,
        padding=ceil(stride / 2),
        output_padding=output_padding,
    )
)

res_unit1 instance-attribute

res_unit1 = OobleckResidualUnit(output_dim, dilation=1)

res_unit2 instance-attribute

res_unit2 = OobleckResidualUnit(output_dim, dilation=3)

res_unit3 instance-attribute

res_unit3 = OobleckResidualUnit(output_dim, dilation=9)

snake1 instance-attribute

snake1 = Snake1d(input_dim)

forward

forward(hidden_state: Tensor) -> Tensor

OobleckResidualUnit

Bases: Module

Residual unit used by the diffusers Oobleck decoder.

conv1 instance-attribute

conv1 = weight_norm(
    Conv1d(
        dimension,
        dimension,
        kernel_size=7,
        dilation=dilation,
        padding=pad,
    )
)

conv2 instance-attribute

conv2 = weight_norm(
    Conv1d(dimension, dimension, kernel_size=1)
)

snake1 instance-attribute

snake1 = Snake1d(dimension)

snake2 instance-attribute

snake2 = Snake1d(dimension)

forward

forward(hidden_state: Tensor) -> Tensor

Snake1d

Bases: Module

One-dimensional Snake activation matching diffusers' Oobleck layout.

alpha instance-attribute

alpha = Parameter(zeros(1, hidden_dim, 1))

beta instance-attribute

beta = Parameter(zeros(1, hidden_dim, 1))

logscale instance-attribute

logscale = logscale

forward

forward(hidden_states: Tensor) -> Tensor