Skip to content

vllm_omni.model_executor.models.ming_tts

Modules:

| Name | Description |
| --- | --- |
| aggregator | |
| audio_prep | |
| config_ming_tts | Ming dense checkpoint config adapters. |
| constants | |
| flowloss_head | |
| fm | |
| ming_tts | |
| ming_tts_audio_vae | |
| ming_tts_llm | |
| patch_emission | |
| pipeline | Ming TTS pipeline: Stage-0 LLM+flow -> Stage-1 audio VAE. |
| prompt_assembly | |
| prompt_encoder | |
| speaker_extractor | |
| validation | |

MingAudioVAEModel

Bases: Module

audio instance-attribute

audio = AudioVAE(audio_tokenizer_config)

enable_update_additional_information instance-attribute

enable_update_additional_information = True

has_postprocess instance-attribute

has_postprocess = False

has_preprocess instance-attribute

has_preprocess = False

have_multimodal_outputs instance-attribute

have_multimodal_outputs = True

input_modalities class-attribute instance-attribute

input_modalities = 'audio'

ming_config instance-attribute

ming_config = from_hf_config(hf_config)

requires_raw_input_tokens instance-attribute

requires_raw_input_tokens = True

vllm_config instance-attribute

vllm_config = vllm_config

chunked_decode_streaming

chunked_decode_streaming(
    latent_chunk: Tensor, *, request_id: str, finished: bool
) -> tuple[Tensor, Any, Any, bool, bool]

compute_logits

compute_logits(
    hidden_states: Tensor | OmniOutput,
    sampling_metadata: Any = None,
) -> None

embed_input_ids

embed_input_ids(input_ids: Tensor, **_: Any) -> Tensor

forward

forward(
    input_ids: Tensor | None = None,
    positions: Tensor | None = None,
    intermediate_tensors: Any = None,
    inputs_embeds: Tensor | None = None,
    model_intermediate_buffer: list[dict[str, Any]]
    | None = None,
    **kwargs: Any,
) -> OmniOutput

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

MingDenseConfig

Bases: PretrainedConfig

aggregator_config instance-attribute

aggregator_config = dict(aggregator_config or {})

audio_tokenizer_config instance-attribute

audio_tokenizer_config = _coerce_audio_vae_config(
    audio_tokenizer_config
)

ditar_config instance-attribute

ditar_config = dict(ditar_config or {})

llm_config instance-attribute

llm_config = _coerce_qwen2_config(llm_config or {})

model_type class-attribute instance-attribute

model_type = 'dense'

get_text_config

get_text_config(
    decoder: bool = False, **kwargs: Any
) -> Qwen2Config

MingLLMModel

Bases: Module

flowloss instance-attribute

flowloss = FlowLoss(
    z_channels=latent_dim,
    llm_cond_dim=llm_hidden_size,
    **(ditar_config),
)

fm_dtype instance-attribute

fm_dtype = _resolve_ming_runtime_dtype(vllm_config)

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={"model.model.": "model."}
)

linear_proj_audio instance-attribute

linear_proj_audio = Aggregator(
    in_channels=latent_dim,
    llm_input_dim=llm_hidden_size,
    **(aggregator_config),
)

ming_config instance-attribute

ming_config = from_hf_config(hf_config)

model instance-attribute

model = Qwen2Model(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)

prefix instance-attribute

prefix = prefix

spk_head instance-attribute

spk_head = Linear(192, llm_hidden_size, bias=True)

stop_head instance-attribute

stop_head = Linear(llm_hidden_size, 2, bias=True)

vllm_config instance-attribute

vllm_config = vllm_config

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Tensor | None

embed_input_ids

embed_input_ids(
    input_ids: Tensor,
    inputs_embeds: Tensor | None = None,
    **_: Any,
) -> Tensor

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    latent_history: Tensor | None = None,
    model_intermediate_buffer: list[dict[str, Any]]
    | None = None,
    seq_token_counts: list[int] | None = None,
    **kwargs: object,
) -> OmniOutput | IntermediateTensors | Tensor

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

pop_postprocess_update

pop_postprocess_update(req_id: str) -> dict[str, Any]

project_speaker_embedding

project_speaker_embedding(spk_emb: Tensor) -> Tensor

sample

sample(logits, sampling_metadata)

MingTTSForConditionalGeneration

Bases: Module, SupportsPP, CustomProcessMixin

has_postprocess instance-attribute

has_postprocess = False

has_preprocess instance-attribute

has_preprocess = False

have_multimodal_outputs instance-attribute

have_multimodal_outputs = True

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = getattr(
    model, "make_empty_intermediate_tensors", lambda: None
)

ming_config instance-attribute

ming_config = from_hf_config(hf_config)

model instance-attribute

model = init_vllm_registered_model(
    vllm_config=vllm_config, architectures=["MingLLMModel"]
)

model_stage instance-attribute

model_stage = model_stage

requires_raw_input_tokens instance-attribute

requires_raw_input_tokens = False

sampler cached property

sampler

vllm_config instance-attribute

vllm_config = vllm_config

compute_logits

compute_logits(hidden_states, sampling_metadata=None)

embed_input_ids

embed_input_ids(input_ids: Tensor, **kwargs: Any) -> Tensor

forward

forward(*args: Any, **kwargs: Any)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

postprocess

postprocess(
    hidden_states: Tensor, **info_dict: Any
) -> dict[str, Any]

preprocess

preprocess(
    input_ids: Tensor,
    input_embeds: Tensor | None,
    **info_dict: Any,
)

sample

sample(logits, sampling_metadata)