Skip to content

vllm_omni.transformers_utils.configs.ming_flash_omni

Configuration for the Ming-flash-omni-2.0 model.

logger module-attribute

logger = get_logger(__name__)

BailingMM2Config

Bases: PretrainedConfig

audio_config instance-attribute

audio_config = (
    WhisperEncoderConfig(**audio_config)
    if isinstance(audio_config, dict)
    else audio_config
)

ignore_keys_at_rope_validation class-attribute instance-attribute

ignore_keys_at_rope_validation = {'mrope_section'}

is_composition class-attribute instance-attribute

is_composition = True

llm_config instance-attribute

llm_config = (
    BailingMoeV2Config(**llm_config)
    if isinstance(llm_config, dict)
    else llm_config
)

mlp_depth instance-attribute

mlp_depth = mlp_depth

model_type class-attribute instance-attribute

model_type = 'bailingmm_moe_v2_lite'

sub_configs class-attribute instance-attribute

sub_configs: ClassVar = {'llm_config': AutoConfig}

vision_config instance-attribute

vision_config = (
    Qwen3VLMoeVisionConfig(**vision_config)
    if isinstance(vision_config, dict)
    else vision_config
)

get_text_config

get_text_config(decoder: bool = False) -> PretrainedConfig

BailingMoeV2Config

Bases: PretrainedConfig

attention_dropout instance-attribute

attention_dropout = attention_dropout

audio_end_token instance-attribute

audio_end_token = audio_end_token

audio_patch_token instance-attribute

audio_patch_token = audio_patch_token

audio_start_token instance-attribute

audio_start_token = audio_start_token

embedding_dropout instance-attribute

embedding_dropout = embedding_dropout

first_k_dense_replace instance-attribute

first_k_dense_replace = first_k_dense_replace

head_dim instance-attribute

head_dim = head_dim or hidden_size // num_attention_heads

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

ignore_keys_at_rope_validation class-attribute instance-attribute

ignore_keys_at_rope_validation = {'mrope_section'}

image_end_token instance-attribute

image_end_token = image_end_token

image_patch_token instance-attribute

image_patch_token = image_patch_token

image_start_token instance-attribute

image_start_token = image_start_token

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

max_window_layers instance-attribute

max_window_layers = max_window_layers

model_type class-attribute instance-attribute

model_type = 'bailing_moe_v2'

moe_intermediate_size instance-attribute

moe_intermediate_size = moe_intermediate_size

n_group instance-attribute

n_group = n_group

norm_head instance-attribute

norm_head = norm_head

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_experts instance-attribute

num_experts = num_experts

num_experts_per_tok instance-attribute

num_experts_per_tok = num_experts_per_tok

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

num_shared_experts instance-attribute

num_shared_experts = num_shared_experts

output_dropout instance-attribute

output_dropout = output_dropout

output_router_logits instance-attribute

output_router_logits = output_router_logits

partial_rotary_factor instance-attribute

partial_rotary_factor = partial_rotary_factor

rms_norm_eps instance-attribute

rms_norm_eps = rms_norm_eps

rope_parameters instance-attribute

rope_parameters = {'mrope_section': mrope_section}

rope_scaling instance-attribute

rope_scaling = rope_scaling

rope_theta instance-attribute

rope_theta = rope_theta

routed_scaling_factor instance-attribute

routed_scaling_factor = routed_scaling_factor

router_type instance-attribute

router_type = router_type

sliding_window instance-attribute

sliding_window = sliding_window

spatial_merge_size instance-attribute

spatial_merge_size = spatial_merge_size

tokens_per_second instance-attribute

tokens_per_second = tokens_per_second

topk_group instance-attribute

topk_group = topk_group

use_bias instance-attribute

use_bias = use_bias

use_cache instance-attribute

use_cache = use_cache

use_interleaved_frame_timestamp instance-attribute

use_interleaved_frame_timestamp = (
    use_interleaved_frame_timestamp
)

use_qk_norm instance-attribute

use_qk_norm = use_qk_norm

use_qkv_bias instance-attribute

use_qkv_bias = use_qkv_bias

use_sliding_window instance-attribute

use_sliding_window = use_sliding_window

video_end_token instance-attribute

video_end_token = video_end_token

video_patch_token instance-attribute

video_patch_token = video_patch_token

video_start_token instance-attribute

video_start_token = video_start_token

vocab_size instance-attribute

vocab_size = vocab_size

MingFlashOmniConfig

Bases: PretrainedConfig

Configuration class for the unified Ming-flash-omni-2.0 model.

image_gen_config instance-attribute

image_gen_config = MingImageGenConfig(**image_gen_config)

is_composition class-attribute instance-attribute

is_composition = True

model_type class-attribute instance-attribute

model_type = 'ming_flash_omni'

sub_configs class-attribute instance-attribute

sub_configs: ClassVar = {
    "thinker_config": BailingMM2Config,
    "image_gen_config": MingImageGenConfig,
    "talker_config": MingFlashOmniTalkerConfig,
}

talker_config instance-attribute

talker_config = MingFlashOmniTalkerConfig(**talker_config)

thinker_config instance-attribute

thinker_config = BailingMM2Config(**thinker_config)

get_text_config

get_text_config(decoder: bool = False) -> PretrainedConfig

MingFlashOmniTalkerConfig

Bases: PretrainedConfig

Configuration class for the Ming-flash-omni-2.0 talker (TTS) stage.

The talker uses a Qwen2 LLM backbone with CFM (Conditional Flow Matching) via a DiT (Diffusion Transformer), plus an Aggregator that maps generated audio latents back to the LLM embedding space for autoregressive generation.

aggregator instance-attribute

aggregator = aggregator or {}

audio_vae_path instance-attribute

audio_vae_path = audio_vae_path

campplus_model instance-attribute

campplus_model = campplus_model

cfg_strength instance-attribute

cfg_strength = cfg_strength

flowmodel instance-attribute

flowmodel = flowmodel or {}

history_patch_size instance-attribute

history_patch_size = history_patch_size

latent_dim instance-attribute

latent_dim = latent_dim

llm_config instance-attribute

llm_config = llm_config

model_type class-attribute instance-attribute

model_type = 'ming_flash_omni_talker'

patch_size instance-attribute

patch_size = patch_size

steps instance-attribute

steps = steps

get_text_config

get_text_config(decoder: bool = False) -> PretrainedConfig

MingImageGenConfig

Bases: PretrainedConfig

Configuration for the Ming-flash-omni-2.0 image generation stage.

Mirrors the layout of the HF checkpoint at https://huggingface.co/inclusionAI/Ming-flash-omni-2.0 where image-gen components live in sibling subfolders (connector/, transformer/, vae/, scheduler/, mlp/).

connector_subfolder instance-attribute

connector_subfolder = connector_subfolder

default_height instance-attribute

default_height = default_height

default_width instance-attribute

default_width = default_width

diffusion_c_input_dim instance-attribute

diffusion_c_input_dim = diffusion_c_input_dim

guidance_scale instance-attribute

guidance_scale = guidance_scale

img_gen_scales instance-attribute

img_gen_scales = (
    img_gen_scales if img_gen_scales is not None else [16]
)

mlp_subfolder instance-attribute

mlp_subfolder = mlp_subfolder

model_type class-attribute instance-attribute

model_type = 'ming_flash_omni_imagegen'

num_inference_steps instance-attribute

num_inference_steps = num_inference_steps

num_query_tokens property

num_query_tokens: int

Total number of learnable query tokens appended to the thinker input.

For img_gen_scales=[16] this yields 256 tokens (a single 16x16 grid).

scheduler_subfolder instance-attribute

scheduler_subfolder = scheduler_subfolder

text_encoder_norm instance-attribute

text_encoder_norm = text_encoder_norm

thinker_hidden_size instance-attribute

thinker_hidden_size = thinker_hidden_size

transformer_subfolder instance-attribute

transformer_subfolder = transformer_subfolder

vae_subfolder instance-attribute

vae_subfolder = vae_subfolder

Qwen3VLMoeVisionConfig

Bases: PretrainedConfig

Configuration class for the Qwen3 MoE Vision Transformer.

deepstack_visual_indexes instance-attribute

deepstack_visual_indexes = deepstack_visual_indexes

depth instance-attribute

depth = depth

hidden_act instance-attribute

hidden_act = hidden_act

hidden_size instance-attribute

hidden_size = hidden_size

in_channels instance-attribute

in_channels = in_channels

initializer_range instance-attribute

initializer_range = initializer_range

intermediate_size instance-attribute

intermediate_size = intermediate_size

model_type class-attribute instance-attribute

model_type = 'qwen3_moe_vit'

num_heads instance-attribute

num_heads = num_heads

num_position_embeddings instance-attribute

num_position_embeddings = num_position_embeddings

out_hidden_size instance-attribute

out_hidden_size = out_hidden_size

patch_size instance-attribute

patch_size = patch_size

spatial_merge_size instance-attribute

spatial_merge_size = spatial_merge_size

temporal_patch_size instance-attribute

temporal_patch_size = temporal_patch_size

from_pretrained classmethod

from_pretrained(
    pretrained_model_name_or_path: str | PathLike, **kwargs
) -> PretrainedConfig

WhisperEncoderConfig

Bases: PretrainedConfig

Configuration class for the Whisper audio encoder.

ds_kernel_size instance-attribute

ds_kernel_size = ds_kernel_size

ds_stride instance-attribute

ds_stride = ds_stride

model_type class-attribute instance-attribute

model_type = 'whisper_encoder'

norm_query_embeds instance-attribute

norm_query_embeds = norm_query_embeds

whisper_encoder_config instance-attribute

whisper_encoder_config = whisper_encoder_config or {}