Skip to content

vllm_omni.model_executor.models.dynin_omni

Modules:

| Name | Description |
| --- | --- |
| dynin_omni | |
| dynin_omni_common | |
| dynin_omni_token2audio | |
| dynin_omni_token2image | |
| dynin_omni_token2text | |
| pipeline | Dynin-Omni pipeline topology (frozen). |

DyninOmniForConditionalGeneration

Bases: Module, SupportsMultiModal

STAGE_ALIAS class-attribute instance-attribute

STAGE_ALIAS = {
    "tokenizer": "token2text",
    "token2token": "token2text",
    "detok_text": "token2text",
    "token2img": "token2image",
    "token2wav": "token2audio",
    "token2speech": "token2audio",
}

STAGE_IMPL class-attribute instance-attribute

STAGE_IMPL = {
    "token2text": (
        ".dynin_omni_token2text",
        "DyninOmniToken2Text",
    ),
    "token2image": (
        ".dynin_omni_token2image",
        "DyninOmniToken2Image",
    ),
    "token2audio": (
        ".dynin_omni_token2audio",
        "DyninOmniToken2Audio",
    ),
}

has_postprocess instance-attribute

has_postprocess = False

has_preprocess instance-attribute

has_preprocess = False

have_multimodal_outputs instance-attribute

have_multimodal_outputs = getattr(
    impl, "have_multimodal_outputs", True
)

impl instance-attribute

impl = impl_cls(vllm_config=vllm_config, prefix=prefix)

language_model instance-attribute

language_model = _resolve_language_model()

model instance-attribute

model = impl

model_stage instance-attribute

model_stage = _normalize_stage_name(raw_stage)

requires_raw_input_tokens instance-attribute

requires_raw_input_tokens = getattr(
    impl, "requires_raw_input_tokens", True
)

sampler cached property

sampler

supports_multimodal_raw_input_only class-attribute instance-attribute

supports_multimodal_raw_input_only = True

compute_logits

compute_logits(
    hidden_states: Tensor | OmniOutput,
    sampling_metadata: Any = None,
) -> Tensor | None

embed_input_ids

embed_input_ids(
    input_ids: Tensor,
    multimodal_embeddings: Any = None,
    is_multimodal: Tensor | None = None,
    **kwargs: Any,
) -> Tensor

embed_multimodal

embed_multimodal(**kwargs: Any) -> Any

forward

forward(
    input_ids: Tensor | None = None,
    positions: Tensor | None = None,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    **kwargs: Any,
) -> OmniOutput

get_language_model

get_language_model() -> Any | None

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> str | None

init_multi_modal

init_multi_modal(thinker_config: Any = None) -> None

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

make_empty_intermediate_tensors

make_empty_intermediate_tensors(
    batch_size: int, dtype: dtype, device: device
) -> IntermediateTensors

sample

sample(
    logits: Tensor, sampling_metadata: SamplingMetadata
) -> SamplerOutput | None

DyninOmniToken2Audio

Bases: DyninOmniStageBase

Stage-3: token detokenization to speech (or pass-through).

have_multimodal_outputs instance-attribute

have_multimodal_outputs = True

hidden_size instance-attribute

hidden_size = resolve_hidden_size(vllm_config=vllm_config)

requires_raw_input_tokens instance-attribute

requires_raw_input_tokens = True

stage_name class-attribute instance-attribute

stage_name = 'Dynin token2audio'

vllm_config instance-attribute

vllm_config = vllm_config

embed_multimodal

embed_multimodal(**kwargs: Any) -> Any

forward

forward(
    input_ids: Tensor | None = None,
    positions: Tensor | None = None,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    **kwargs: Any,
) -> OmniOutput

DyninOmniToken2Image

Bases: DyninOmniStageBase

Stage-2: token detokenization to image (or pass-through).

have_multimodal_outputs instance-attribute

have_multimodal_outputs = True

hidden_size instance-attribute

hidden_size = resolve_hidden_size(vllm_config=vllm_config)

requires_raw_input_tokens instance-attribute

requires_raw_input_tokens = True

stage_name class-attribute instance-attribute

stage_name = 'Dynin token2image'

vllm_config instance-attribute

vllm_config = vllm_config

embed_multimodal

embed_multimodal(**kwargs: Any) -> Any

forward

forward(
    input_ids: Tensor | None = None,
    positions: Tensor | None = None,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    **kwargs: Any,
) -> OmniOutput

DyninOmniToken2Text

Bases: DyninOmniStageBase

Stage-1: DYNIN generation + text detokenization (or pass-through).

have_multimodal_outputs instance-attribute

have_multimodal_outputs = True

hidden_size instance-attribute

hidden_size = resolve_hidden_size(
    vllm_config=vllm_config, model=model
)

model instance-attribute

model = _load_text_model(
    model_source, local_files_only=model_local_files_only
)

requires_raw_input_tokens instance-attribute

requires_raw_input_tokens = True

stage_name class-attribute instance-attribute

stage_name = 'Dynin token2text'

tokenizer instance-attribute

tokenizer: Any | None = None

vllm_config instance-attribute

vllm_config = vllm_config

embed_multimodal

embed_multimodal(**kwargs: Any) -> Any

forward

forward(
    input_ids: Tensor | None = None,
    positions: Tensor | None = None,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    **kwargs: Any,
) -> OmniOutput

get_language_model

get_language_model() -> Any