Skip to content

vllm_omni.diffusion.models.cosmos3.audio_tokenizer

Modules:

| Name | Description |
| --- | --- |
| avae | Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation. |

Cosmos3AVAEAudioTokenizer

Bases: Module

Decoder-only AVAE tokenizer for Cosmos3 audio latents.

audio_channels instance-attribute

audio_channels = int(
    _config_get(
        config,
        "dec_out_channels",
        "audio_channels",
        default=2
        if bool(get("stereo", audio_channels == 2))
        else 1,
    )
)

decoder instance-attribute

decoder = OobleckDecoder(
    channels=int(
        _config_get(config, "dec_dim", default=320)
    ),
    input_channels=latent_ch,
    audio_channels=audio_channels,
    upsampling_ratios=list(reversed(dec_strides)),
    channel_multiples=list(
        _config_get(
            config, "dec_c_mults", default=[1, 2, 4, 8, 16]
        )
    ),
)

device instance-attribute

device = device(device)

dtype instance-attribute

dtype = dtype

hop_size instance-attribute

hop_size = int(
    _config_get(
        config,
        "hop_size",
        default=prod(dec_strides)
        if dec_strides
        else hop_size,
    )
)

latent_ch instance-attribute

latent_ch = int(
    _config_get(
        config,
        "vocoder_input_dim",
        "io_channels",
        "latent_ch",
        default=io_channels,
    )
)

normalization_type instance-attribute

normalization_type = normalization_type

sample_rate instance-attribute

sample_rate = int(
    _config_get(
        config,
        "sampling_rate",
        "sample_rate",
        default=sample_rate,
    )
)

tanh_clamp instance-attribute

tanh_clamp = float(
    _config_get(config, "tanh_clamp", default=tanh_clamp)
)

tanh_input_scale instance-attribute

tanh_input_scale = float(
    _config_get(
        config, "tanh_input_scale", default=tanh_input_scale
    )
)

tanh_output_scale instance-attribute

tanh_output_scale = float(
    _config_get(
        config,
        "tanh_output_scale",
        default=tanh_output_scale,
    )
)

temporal_compression_factor property

temporal_compression_factor: int

decode

decode(latent: Tensor) -> Tensor

encode

encode(audio: Tensor, force_pad: bool = False) -> Tensor

get_audio_num_samples

get_audio_num_samples(num_latent_samples: int) -> int

get_latent_num_samples

get_latent_num_samples(num_audio_samples: int) -> int