Skip to content

vllm_omni.model_executor.models.cosyvoice3.cosyvoice3_talker

CosyVoice3LM

Bases: Qwen2LM

eos_token instance-attribute

eos_token = speech_token_size + 1

fill_token instance-attribute

fill_token = speech_token_size + 3

llm instance-attribute

llm = llm

llm_decoder instance-attribute

llm_decoder = Linear(
    llm_output_size, speech_token_size + 200, bias=False
)

llm_input_size instance-attribute

llm_input_size = llm_input_size

llm_output_size instance-attribute

llm_output_size = llm_output_size

mix_ratio instance-attribute

mix_ratio = mix_ratio

sos instance-attribute

sos = speech_token_size + 0

speech_embedding instance-attribute

speech_embedding = Embedding(
    speech_token_size + 200, llm_input_size
)

speech_token_size instance-attribute

speech_token_size = speech_token_size

stop_token_ids instance-attribute

stop_token_ids = [
    (speech_token_size + i) for i in (range(200))
]

task_id instance-attribute

task_id = speech_token_size + 2

vllm_output_queue instance-attribute

vllm_output_queue = {}

Qwen2LM

Bases: TransformerLM

eos_token instance-attribute

eos_token = speech_token_size

fill_token instance-attribute

fill_token = speech_token_size + 2

llm instance-attribute

llm = llm

llm_decoder instance-attribute

llm_decoder = Linear(llm_output_size, speech_token_size + 3)

llm_embedding instance-attribute

llm_embedding = Embedding(2, llm_input_size)

llm_input_size instance-attribute

llm_input_size = llm_input_size

llm_output_size instance-attribute

llm_output_size = llm_output_size

mix_ratio instance-attribute

mix_ratio = mix_ratio

sampling instance-attribute

sampling = sampling

sos instance-attribute

sos = 0

speech_embedding instance-attribute

speech_embedding = Embedding(
    speech_token_size + 3, llm_input_size
)

speech_token_size instance-attribute

speech_token_size = speech_token_size

stop_token_ids instance-attribute

stop_token_ids = [
    (speech_token_size + i) for i in (range(3))
]

task_id instance-attribute

task_id = 1

vllm_output_queue instance-attribute

vllm_output_queue = {}

TransformerLM

Bases: Module

eos_token instance-attribute

eos_token = speech_token_size

llm instance-attribute

llm = llm

llm_decoder instance-attribute

llm_decoder = Linear(llm_output_size, speech_token_size + 1)

llm_embedding instance-attribute

llm_embedding = Embedding(2, llm_input_size)

llm_input_size instance-attribute

llm_input_size = llm_input_size

sos instance-attribute

sos = 0

speech_embedding instance-attribute

speech_embedding = Embedding(
    speech_token_size, llm_input_size
)

speech_token_size instance-attribute

speech_token_size = speech_token_size

spk_embed_affine_layer instance-attribute

spk_embed_affine_layer = Linear(
    spk_embed_dim, llm_input_size
)

task_id instance-attribute

task_id = 1

text_embedding instance-attribute

text_embedding = Embedding(
    text_token_size, text_encoder_input_size
)

text_encoder instance-attribute

text_encoder = text_encoder

text_encoder_affine_layer instance-attribute

text_encoder_affine_layer = Linear(
    output_size(), llm_input_size
)

VLLMQwen2Encoder

Bases: Module

Qwen2 encoder using vLLM's Qwen2Model with external KV cache management.

This replaces the HuggingFace Qwen2ForCausalLM with vLLM's optimized implementation, which uses PagedAttention and an external KV cache via ForwardContext.

hidden_size instance-attribute

hidden_size = hidden_size

model instance-attribute

model = Qwen2Model(vllm_config=vllm_config, prefix=prefix)

forward

forward(inputs_embeds: Tensor, positions: Tensor) -> Tensor

Forward pass using vLLM's attention with external KV cache.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| inputs_embeds | Tensor | Input embeddings [total_tokens, hidden_size] or [batch, seq, hidden] | required |
| positions | Tensor | Position tensor for RoPE [total_tokens] | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| hidden_states | Tensor | Output hidden states [total_tokens, hidden_size] |