Skip to content

vllm_omni.diffusion.models.cosmos3.sound_tokenizer

Cosmos3 sound tokenizer integration.

DEFAULT_SOUND_CHANNELS module-attribute

DEFAULT_SOUND_CHANNELS = 2

DEFAULT_SOUND_DIM module-attribute

DEFAULT_SOUND_DIM = 64

DEFAULT_SOUND_HOP_SIZE module-attribute

DEFAULT_SOUND_HOP_SIZE = 1920

DEFAULT_SOUND_LATENT_FPS module-attribute

DEFAULT_SOUND_LATENT_FPS = (
    DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE
)

DEFAULT_SOUND_NORMALIZATION_TYPE module-attribute

DEFAULT_SOUND_NORMALIZATION_TYPE = 'none'

DEFAULT_SOUND_NORMALIZE_LATENTS module-attribute

DEFAULT_SOUND_NORMALIZE_LATENTS = False

DEFAULT_SOUND_SAMPLE_RATE module-attribute

DEFAULT_SOUND_SAMPLE_RATE = 48000

DEFAULT_SOUND_TANH_CLAMP module-attribute

DEFAULT_SOUND_TANH_CLAMP = 0.995

DEFAULT_SOUND_TANH_INPUT_SCALE module-attribute

DEFAULT_SOUND_TANH_INPUT_SCALE = 1.5

DEFAULT_SOUND_TANH_OUTPUT_SCALE module-attribute

DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5

SOUND_TOKENIZER_CHECKPOINT_NAME module-attribute

SOUND_TOKENIZER_CHECKPOINT_NAME = (
    "diffusion_pytorch_model.safetensors"
)

SOUND_TOKENIZER_COMPONENT_NAME module-attribute

SOUND_TOKENIZER_COMPONENT_NAME = 'sound_tokenizer'

logger module-attribute

logger = init_logger(__name__)

Cosmos3SoundTokenizer

Thin adapter around the local AVAE tokenizer implementation.

audio_channels instance-attribute

audio_channels = int(
    getattr(
        tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS
    )
)

hop_size instance-attribute

hop_size = int(
    getattr(
        tokenizer,
        "temporal_compression_factor",
        DEFAULT_SOUND_HOP_SIZE,
    )
)

latent_ch instance-attribute

latent_ch = int(
    getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM)
)

latent_fps instance-attribute

latent_fps = float(sample_rate) / float(hop_size)

sample_rate instance-attribute

sample_rate = int(
    getattr(
        tokenizer, "sample_rate", DEFAULT_SOUND_SAMPLE_RATE
    )
)

tokenizer instance-attribute

tokenizer = tokenizer

decode

decode(latents: Tensor) -> Tensor

Decode sound latents.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| latents | Tensor | [B, C, T] or [C, T] tensor. | required |

Returns:

| Type | Description |
| --- | --- |
| Tensor | [B, audio_channels, N] tensor for batched input, or [audio_channels, N] for unbatched input. |

from_config classmethod

from_config(
    od_config: OmniDiffusionConfig,
) -> Cosmos3SoundTokenizer

get_audio_num_samples

get_audio_num_samples(num_latent_samples: int) -> int

get_latent_num_samples

get_latent_num_samples(num_audio_samples: int) -> int

get_sound_channels

get_sound_channels(od_config: OmniDiffusionConfig) -> int

get_sound_config_value

get_sound_config_value(
    od_config: OmniDiffusionConfig,
    name: str,
    default: Any,
    aliases: tuple[str, ...] = (),
) -> Any

get_sound_hop_size

get_sound_hop_size(od_config: OmniDiffusionConfig) -> int

get_sound_sample_rate

get_sound_sample_rate(
    od_config: OmniDiffusionConfig,
) -> int