vllm_omni.diffusion.models.internvla_a1.model_internvla_a1

InternVLAA1

Bases: Module

action_in_proj instance-attribute

action_in_proj = Linear(max_action_dim, hidden_size)

action_out_proj instance-attribute

action_out_proj = Linear(hidden_size, max_action_dim)

action_time_mlp_in instance-attribute

action_time_mlp_in = Linear(2 * hidden_size, hidden_size)

action_time_mlp_out instance-attribute

action_time_mlp_out = Linear(hidden_size, hidden_size)

config instance-attribute

config = config

cosmos instance-attribute

cosmos = ImageTokenizer(
    checkpoint_enc=str(cosmos_encoder_path),
    checkpoint_dec=str(cosmos_decoder_path),
    device=device,
)

cosmos_in_proj instance-attribute

cosmos_in_proj = Conv2d(
    vae_dim, hidden_size, kernel_size=1, stride=1, padding=0
)

cosmos_out_layer_norm instance-attribute

cosmos_out_layer_norm = LayerNorm(hidden_size)

cosmos_out_proj instance-attribute

cosmos_out_proj = Linear(hidden_size, vae_dim)

downsample_conv instance-attribute

downsample_conv = Conv2d(
    hidden_size,
    hidden_size,
    kernel_size=ds,
    stride=ds,
    padding=0,
)

qwen3_vl_with_expert instance-attribute

qwen3_vl_with_expert = Qwen3VLWithExpertModel(
    vlm_config, action_expert_config, precision=dtype
)

state_proj instance-attribute

state_proj = Linear(max_state_dim, hidden_size)

upsample_conv instance-attribute

upsample_conv = ConvTranspose2d(
    hidden_size,
    hidden_size,
    kernel_size=ds,
    stride=ds,
    padding=0,
)

denoise_step

denoise_step(
    state: Tensor,
    prefix_pad_masks: Tensor,
    past_key_values: Any,
    max_prefix_position_ids: Tensor,
    x_t: Tensor,
    timestep: Tensor,
) -> Tensor

denoise_step_optimized

denoise_step_optimized(
    suffix_static: SuffixStaticContext,
    past_key_values: Any,
    x_t: Tensor,
    timestep: Tensor,
) -> Tensor

embed_middle

embed_middle(
    images: Tensor, img_masks: Tensor
) -> tuple[Tensor, Tensor, Tensor]

embed_prefix

embed_prefix(
    pixel_values: Tensor,
    image_grid_thw: Tensor,
    lang_tokens: Tensor,
    lang_masks: Tensor,
) -> tuple[Tensor, Tensor, Tensor]

embed_suffix

embed_suffix(
    state: Tensor, noisy_actions: Tensor, timestep: Tensor
) -> tuple[Tensor, Tensor, Tensor]

get_cosmos_features

get_cosmos_features(images: Tensor) -> Tensor

get_position_ids

get_position_ids(
    lang_tokens: Tensor,
    image_grid_thw: Tensor | None,
    pad_masks: Tensor,
) -> tuple[Tensor, Any]

prepare_suffix_static_context

prepare_suffix_static_context(
    state: Tensor,
    prefix_pad_masks: Tensor,
    max_prefix_position_ids: Tensor,
) -> SuffixStaticContext

sample_actions

sample_actions(
    images: Tensor,
    img_masks: Tensor,
    pixel_values: Tensor,
    image_grid_thw: Tensor,
    lang_tokens: Tensor,
    lang_masks: Tensor,
    state: Tensor,
    *,
    noise: Tensor | None = None,
    num_steps: int | None = None,
    decode_image: bool = False,
) -> tuple[Tensor, Tensor | None]

sample_noise

sample_noise(
    shape: tuple[int, ...], device: device
) -> Tensor

set_attention_implementation

set_attention_implementation(
    attn_implementation: str,
) -> None

InternVLAA1Policy

Bases: Module

config instance-attribute

config = config

input_builder instance-attribute

input_builder = Qwen3VLInputBuilder(
    processor_model_name=processor_model_name,
    max_length=tokenizer_max_length,
)

model instance-attribute

model = InternVLAA1(
    config,
    cosmos_encoder_path=cosmos_encoder_path,
    cosmos_decoder_path=cosmos_decoder_path,
)

forward

forward(
    batch: dict[str, Any],
    *,
    noise: Tensor | None = None,
    decode_image: bool = False,
) -> tuple[Tensor, Tensor | None]

from_pretrained classmethod

from_pretrained(
    checkpoint_dir: str | Path,
    *,
    config: InternVLAA1Config | None = None,
    processor_model_name: str = DEFAULT_QWEN3_VL_MODEL,
    strict: bool = False,
) -> InternVLAA1Policy

prepare_state

prepare_state(batch: dict[str, Tensor]) -> Tensor

to

to(*args, **kwargs)

Qwen3VLInputBuilder

max_length instance-attribute

max_length = max_length

processor instance-attribute

processor = from_pretrained(
    str(local_path), local_files_only=True
)

spatial_merge_size instance-attribute

spatial_merge_size = spatial_merge_size

build

build(
    camera_images: list[Tensor], task: str
) -> dict[str, Tensor]

Qwen3VLWithExpertModel

Bases: Module

act_expert instance-attribute

act_expert = Qwen3VLTextModel(config=act_expert_config_hf)

gen_expert instance-attribute

gen_expert = Qwen3VLTextModel(config=gen_expert_config_hf)

und_expert instance-attribute

und_expert = Qwen3VLForConditionalGeneration(
    config=vlm_config_hf
)

forward

forward(
    attention_mask: Tensor | None,
    position_ids: LongTensor | None,
    past_key_values: Any,
    inputs_embeds: list[Tensor | None],
    use_cache: bool,
) -> tuple[list[Tensor | None], Any]

to_selected_precision

to_selected_precision(
    precision: Literal["bfloat16", "float32"],
) -> None

QwenConfig

head_dim instance-attribute

head_dim = head_dim

hidden_size instance-attribute

hidden_size = hidden_size

intermediate_size instance-attribute

intermediate_size = intermediate_size

num_attention_heads instance-attribute

num_attention_heads = num_attention_heads

num_hidden_layers instance-attribute

num_hidden_layers = num_hidden_layers

num_key_value_heads instance-attribute

num_key_value_heads = num_key_value_heads

SuffixStaticContext dataclass

full_att_2d_masks_4d instance-attribute

full_att_2d_masks_4d: Tensor

position_ids instance-attribute

position_ids: Tensor

state_emb instance-attribute

state_emb: Tensor

compute_layer_complete

compute_layer_complete(
    layer_idx: int,
    inputs_embeds: list[Tensor],
    attention_mask: Tensor,
    position_ids: Tensor,
    und_expert: Module,
    gen_expert: Module,
    act_expert: Module,
) -> list[Tensor]

create_sinusoidal_pos_embedding

create_sinusoidal_pos_embedding(
    time: Tensor,
    dimension: int,
    min_period: float,
    max_period: float,
    device: device,
) -> Tensor

get_qwen_config

get_qwen_config(variant: str) -> QwenConfig

get_safe_dtype

get_safe_dtype(
    target_dtype: dtype, device_type: str
) -> dtype

make_att_2d_masks

make_att_2d_masks(
    pad_masks: Tensor, att_masks: Tensor
) -> Tensor

pad_vector

pad_vector(vector: Tensor, new_dim: int) -> Tensor

resize_with_pad

resize_with_pad(
    images: Tensor, size: tuple[int, int]
) -> Tensor

resolve_cosmos_checkpoint_paths

resolve_cosmos_checkpoint_paths(
    *,
    encoder_path: str | Path | None = None,
    decoder_path: str | Path | None = None,
) -> tuple[Path, Path]