vllm_omni.transformers_utils.configs.cosyvoice3
CosyVoice3Config
Bases: PretrainedConfig
feat_extractor instance-attribute
feat_extractor = {
"n_fft": 1920,
"num_mels": 80,
"sampling_rate": sample_rate,
"hop_size": 480,
"win_size": 1920,
"fmin": 0,
"fmax": None,
"center": False,
}
flow instance-attribute
flow = {
"input_size": 80,
"output_size": 80,
"spk_embed_dim": spk_embed_dim,
"output_type": "mel",
"vocab_size": 6561,
"input_frame_rate": token_frame_rate,
"only_mask_loss": True,
"token_mel_ratio": token_mel_ratio,
"pre_lookahead_len": 3,
"pre_lookahead_layer": {
"in_channels": 80,
"channels": 1024,
"pre_lookahead_len": 3,
},
"decoder": {
"in_channels": 240,
"n_spks": 1,
"spk_emb_dim": 80,
"cfm_params": {
"sigma_min": 1e-06,
"solver": "euler",
"t_scheduler": "cosine",
"training_cfg_rate": 0.2,
"inference_cfg_rate": 0.7,
"reg_loss_type": "l1",
},
"estimator": {
"dim": 1024,
"depth": 22,
"heads": 16,
"dim_head": 64,
"ff_mult": 2,
"mel_dim": 80,
"mu_dim": 80,
"spk_dim": 80,
"out_channels": 80,
"static_chunk_size": token_frame_rate
* token_mel_ratio,
"num_decoding_left_chunks": -1,
},
},
}
hift instance-attribute
hift = {
"in_channels": 80,
"base_channels": 512,
"nb_harmonics": 8,
"sampling_rate": sample_rate,
"nsf_alpha": 0.1,
"nsf_sigma": 0.003,
"nsf_voiced_threshold": 10,
"upsample_rates": [8, 5, 3],
"upsample_kernel_sizes": [16, 11, 7],
"istft_params": {"n_fft": 16, "hop_len": 4},
"resblock_kernel_sizes": [3, 7, 11],
"resblock_dilation_sizes": [
[1, 3, 5],
[1, 3, 5],
[1, 3, 5],
],
"source_resblock_kernel_sizes": [7, 7, 11],
"source_resblock_dilation_sizes": [
[1, 3, 5],
[1, 3, 5],
[1, 3, 5],
],
"lrelu_slope": 0.1,
"audio_limit": 0.99,
"conv_pre_look_right": 4,
"f0_predictor": {
"num_class": 1,
"in_channels": 80,
"cond_channels": 512,
},
}
llm instance-attribute
llm = {
"llm_input_size": llm_input_size,
"llm_output_size": llm_output_size,
"speech_token_size": 6561,
"eos_token_id": 6561 + 1,
"length_normalized_loss": True,
"lsm_weight": 0,
"mix_ratio": [5, 15],
"llm": {"pretrain_path": qwen_pretrain_path},
"sampling": {
"top_p": 0.8,
"top_k": 25,
"win_size": 10,
"tau_r": 0.1,
},
"spk_embed_dim": spk_embed_dim,
}