vllm_omni.diffusion.models.cosmos3.action ¶
Action-token helpers for Cosmos3 UVA/action generation.
ACTION_MODES module-attribute ¶
# Set collecting the three action-mode constants. The ACTION_MODE_* names
# are not shown in this section; they come from elsewhere in the module.
ACTION_MODES = {
    ACTION_MODE_POLICY,
    ACTION_MODE_FORWARD_DYNAMICS,
    ACTION_MODE_INVERSE_DYNAMICS,
}
EMBODIMENT_TO_DOMAIN_ID module-attribute ¶
# Lookup table from embodiment name to integer domain id.
# The ids are not contiguous, and some names share an id:
# 8 is used by two entries and 15 by three.
EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
    "no_action": 0,
    "av": 1,
    "camera_pose": 2,
    "hand_pose": 3,
    "pusht": 4,
    "libero": 5,
    "umi": 6,
    "bridge_orig_lerobot": 7,
    # Both of the following map to domain id 8.
    "droid_lerobot": 8,
    "robomind-franka": 8,
    "galbot": 9,
    "robomind-franka-dual": 12,
    "robomind-ur": 13,
    # All three agibot* names map to domain id 15.
    "agibotworld": 15,
    "agibot_gear_gripper": 15,
    "agibot_gear_gripper_ext": 15,
    "fractal": 20,
}
VIDEO_RES_SIZE_INFO module-attribute ¶
# Two-level lookup: resolution key (string) -> aspect-ratio key (string of
# two comma-separated integers) -> pair of integer sizes.
# NOTE(review): the pairs look like (width, height) -- e.g. "16,9" maps to a
# pair whose first element is the larger one -- confirm against callers.
# The "704" and "720" tables share the same "1,1" entry, (960, 960).
VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
    "256": {
        "1,1": (256, 256),
        "4,3": (320, 256),
        "3,4": (256, 320),
        "16,9": (320, 192),
        "9,16": (192, 320),
    },
    "480": {
        "1,1": (640, 640),
        "4,3": (736, 544),
        "3,4": (544, 736),
        "16,9": (832, 480),
        "9,16": (480, 832),
    },
    "704": {
        "1,1": (960, 960),
        "4,3": (1088, 832),
        "3,4": (832, 1088),
        "16,9": (1280, 704),
        "9,16": (704, 1280),
    },
    "720": {
        "1,1": (960, 960),
        "4,3": (1104, 832),
        "3,4": (832, 1104),
        "16,9": (1280, 720),
        "9,16": (720, 1280),
    },
}
action_start_frame_offset ¶
build_action_condition_mask ¶
build_action_condition_mask(
    mode: str,
    action_length: int,
    *,
    device: device,
    dtype: dtype,
) -> Tensor
build_vision_condition_mask ¶
build_vision_condition_mask(
    mode: str,
    video_length: int,
    temporal_compression_factor: int,
    *,
    device: device,
    dtype: dtype,
) -> Tensor