Skip to content

vllm_omni.model_executor.models.cosyvoice3.cosyvoice3

logger module-attribute

logger = init_logger(__name__)

CosyVoice3DummyInputsBuilder

Bases: BaseDummyInputsBuilder[CosyVoice3MultiModalProcessingInfo]

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions]
    | None = None,
) -> MultiModalDataDict

get_dummy_processor_inputs

get_dummy_processor_inputs(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions]
    | None = None,
) -> ProcessorInputs

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str

CosyVoice3Model

Bases: Module, SupportsMultiModal

code2wav instance-attribute

code2wav = CosyVoice3Code2Wav(config)

config instance-attribute

config = hf_config

enable_update_additional_information instance-attribute

enable_update_additional_information = True

have_multimodal_outputs instance-attribute

have_multimodal_outputs = True

hift instance-attribute

hift = hift

mel_cache_len instance-attribute

mel_cache_len = mel_cache_len

mel_overlap_len instance-attribute

mel_overlap_len = mel_overlap_len

mel_window instance-attribute

mel_window = mel_window

model instance-attribute

model = None

model_dir instance-attribute

model_dir = model_dir

model_stage instance-attribute

model_stage = model_stage

prefer_model_sampler class-attribute instance-attribute

prefer_model_sampler = True

requires_raw_input_tokens class-attribute instance-attribute

requires_raw_input_tokens = True

source_cache_len instance-attribute

source_cache_len = source_cache_len

speech_window instance-attribute

speech_window = speech_window

supports_multimodal class-attribute instance-attribute

supports_multimodal = True

supports_multimodal_raw_input_only class-attribute instance-attribute

supports_multimodal_raw_input_only = True

talker instance-attribute

talker = CosyVoice3LM(
    llm_input_size=llm["llm_input_size"],
    llm_output_size=llm["llm_output_size"],
    speech_token_size=llm["speech_token_size"],
    llm=llm,
    length_normalized_loss=llm["length_normalized_loss"],
    lsm_weight=llm["lsm_weight"],
    mix_ratio=llm["mix_ratio"],
)

token_overlap_len instance-attribute

token_overlap_len = token_overlap_len

compute_logits

compute_logits(
    hidden_states: Tensor | OmniOutput,
) -> Tensor | None

embed_input_ids

embed_input_ids(
    input_ids: Tensor,
    multimodal_embeddings=None,
    is_multimodal=None,
) -> Tensor

embed_multimodal

embed_multimodal(**kwargs: object) -> Tensor

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    additional_information: dict[str, object] | None = None,
    **kwargs: object,
) -> OmniOutput

get_language_model

get_language_model() -> Module

Return the language model for upstream MoE detection.

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

sample

sample(
    logits: Tensor, sampling_metadata: SamplingMetadata
) -> SamplerOutput | None

CosyVoice3MultiModalProcessingInfo

Bases: BaseProcessingInfo

get_data_parser

get_data_parser()

get_hf_config

get_hf_config()

If the config is not already present, pass it as a class and it will try to find it in your model directory; just copy the config class there as well.

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, int | None]

How many audio inputs can be passed. This should probably be limited to 1, but for now it is kept as None.

CosyVoice3MultiModalProcessor

Bases: BaseMultiModalProcessor[CosyVoice3MultiModalProcessingInfo]