vllm_omni.diffusion.models.cosmos3.action ¶

Action-token helpers for Cosmos3 action generation.

These helpers cover the action modes that use action tokens as an auxiliary output stream: `policy`, `forward_dynamics`, and `inverse_dynamics`. The pipeline returns predicted actions through `custom_output` rather than as a normal `modalities` entry.

ACTION_MODES `module-attribute` ¶

ACTION_MODES = {
    ACTION_MODE_POLICY,
    ACTION_MODE_FORWARD_DYNAMICS,
    ACTION_MODE_INVERSE_DYNAMICS,
}

ACTION_MODE_FORWARD_DYNAMICS `module-attribute` ¶

ACTION_MODE_FORWARD_DYNAMICS = 'forward_dynamics'

ACTION_MODE_INVERSE_DYNAMICS `module-attribute` ¶

ACTION_MODE_INVERSE_DYNAMICS = 'inverse_dynamics'

ACTION_MODE_POLICY `module-attribute` ¶

ACTION_MODE_POLICY = 'policy'

EMBODIMENT_TO_DOMAIN_ID `module-attribute` ¶

EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
    "no_action": 0,
    "av": 1,
    "camera_pose": 2,
    "hand_pose": 3,
    "pusht": 4,
    "libero": 5,
    "umi": 6,
    "bridge_orig_lerobot": 7,
    "droid_lerobot": 8,
    "robomind-franka": 8,
    "galbot": 9,
    "robomind-franka-dual": 12,
    "robomind-ur": 13,
    "agibotworld": 15,
    "agibot_gear_gripper": 15,
    "agibot_gear_gripper_ext": 15,
    "fractal": 20,
}

VIDEO_RES_SIZE_INFO `module-attribute` ¶

VIDEO_RES_SIZE_INFO: dict[
    str, dict[str, tuple[int, int]]
] = {
    "256": {
        "1,1": (256, 256),
        "4,3": (320, 256),
        "3,4": (256, 320),
        "16,9": (320, 192),
        "9,16": (192, 320),
    },
    "480": {
        "1,1": (640, 640),
        "4,3": (736, 544),
        "3,4": (544, 736),
        "16,9": (832, 480),
        "9,16": (480, 832),
    },
    "704": {
        "1,1": (960, 960),
        "4,3": (1088, 832),
        "3,4": (832, 1088),
        "16,9": (1280, 704),
        "9,16": (704, 1280),
    },
    "720": {
        "1,1": (960, 960),
        "4,3": (1104, 832),
        "3,4": (832, 1104),
        "16,9": (1280, 720),
        "9,16": (720, 1280),
    },
}

action_condition_indexes ¶

action_condition_indexes(
    mode: str, action_length: int
) -> list[int]

action_start_frame_offset ¶

action_start_frame_offset(
    mode: str, action_length: int, video_length: int
) -> int

build_action_condition_mask ¶

build_action_condition_mask(
    mode: str,
    action_length: int,
    *,
    device: device,
    dtype: dtype,
) -> Tensor

build_vision_condition_mask ¶

build_vision_condition_mask(
    mode: str,
    video_length: int,
    temporal_compression_factor: int,
    *,
    device: device,
    dtype: dtype,
) -> Tensor

find_closest_target_size ¶

find_closest_target_size(
    h: int, w: int, resolution: str | int
) -> tuple[int, int]

load_action_tensor ¶

load_action_tensor(action: Any = None) -> Tensor

normalize_action_mode ¶

normalize_action_mode(mode: Any) -> str | None

pad_action_to_dim ¶

pad_action_to_dim(
    action: Tensor, action_dim: int
) -> Tensor

resolve_domain_id ¶

resolve_domain_id(
    *,
    domain_id: Any = None,
    domain_name: Any = None,
    require_explicit: bool = False,
) -> int

vision_condition_indexes ¶

vision_condition_indexes(
    mode: str,
    video_length: int,
    temporal_compression_factor: int,
) -> list[int]