Skip to content

vllm_omni.diffusion.models.cosmos3.action

Action-token helpers for Cosmos3 UVA/action generation.

ACTION_MODES module-attribute

ACTION_MODE_FORWARD_DYNAMICS module-attribute

ACTION_MODE_FORWARD_DYNAMICS = 'forward_dynamics'

ACTION_MODE_INVERSE_DYNAMICS module-attribute

ACTION_MODE_INVERSE_DYNAMICS = 'inverse_dynamics'

ACTION_MODE_POLICY module-attribute

ACTION_MODE_POLICY = 'policy'

EMBODIMENT_TO_DOMAIN_ID module-attribute

EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
    "no_action": 0,
    "av": 1,
    "camera_pose": 2,
    "hand_pose": 3,
    "pusht": 4,
    "libero": 5,
    "umi": 6,
    "bridge_orig_lerobot": 7,
    "droid_lerobot": 8,
    "robomind-franka": 8,
    "galbot": 9,
    "robomind-franka-dual": 12,
    "robomind-ur": 13,
    "agibotworld": 15,
    "agibot_gear_gripper": 15,
    "agibot_gear_gripper_ext": 15,
    "fractal": 20,
}

VIDEO_RES_SIZE_INFO module-attribute

VIDEO_RES_SIZE_INFO: dict[
    str, dict[str, tuple[int, int]]
] = {
    "256": {
        "1,1": (256, 256),
        "4,3": (320, 256),
        "3,4": (256, 320),
        "16,9": (320, 192),
        "9,16": (192, 320),
    },
    "480": {
        "1,1": (640, 640),
        "4,3": (736, 544),
        "3,4": (544, 736),
        "16,9": (832, 480),
        "9,16": (480, 832),
    },
    "704": {
        "1,1": (960, 960),
        "4,3": (1088, 832),
        "3,4": (832, 1088),
        "16,9": (1280, 704),
        "9,16": (704, 1280),
    },
    "720": {
        "1,1": (960, 960),
        "4,3": (1104, 832),
        "3,4": (832, 1104),
        "16,9": (1280, 720),
        "9,16": (720, 1280),
    },
}

action_condition_indexes

action_condition_indexes(
    mode: str, action_length: int
) -> list[int]

action_start_frame_offset

action_start_frame_offset(
    mode: str, action_length: int, video_length: int
) -> int

build_action_condition_mask

build_action_condition_mask(
    mode: str,
    action_length: int,
    *,
    device: device,
    dtype: dtype,
) -> Tensor

build_vision_condition_mask

build_vision_condition_mask(
    mode: str,
    video_length: int,
    temporal_compression_factor: int,
    *,
    device: device,
    dtype: dtype,
) -> Tensor

find_closest_target_size

find_closest_target_size(
    h: int, w: int, resolution: str | int
) -> tuple[int, int]

load_action_tensor

load_action_tensor(action: Any = None) -> Tensor

normalize_action_mode

normalize_action_mode(mode: Any) -> str | None

pad_action_to_dim

pad_action_to_dim(
    action: Tensor, action_dim: int
) -> Tensor

resolve_domain_id

resolve_domain_id(
    *,
    domain_id: Any = None,
    domain_name: Any = None,
    require_explicit: bool = False,
) -> int

vision_condition_indexes

vision_condition_indexes(
    mode: str,
    video_length: int,
    temporal_compression_factor: int,
) -> list[int]