vllm.envs

CMAKE_BUILD_TYPE module-attribute

CMAKE_BUILD_TYPE: Optional[str] = None

CUDA_VISIBLE_DEVICES module-attribute

CUDA_VISIBLE_DEVICES: Optional[str] = None

K_SCALE_CONSTANT module-attribute

K_SCALE_CONSTANT: int = 200

LD_LIBRARY_PATH module-attribute

LD_LIBRARY_PATH: Optional[str] = None

LOCAL_RANK module-attribute

LOCAL_RANK: int = 0

MAX_JOBS module-attribute

MAX_JOBS: Optional[str] = None

NVCC_THREADS module-attribute

NVCC_THREADS: Optional[str] = None

Q_SCALE_CONSTANT module-attribute

Q_SCALE_CONSTANT: int = 200

S3_ACCESS_KEY_ID module-attribute

S3_ACCESS_KEY_ID: Optional[str] = None

S3_ENDPOINT_URL module-attribute

S3_ENDPOINT_URL: Optional[str] = None

S3_SECRET_ACCESS_KEY module-attribute

S3_SECRET_ACCESS_KEY: Optional[str] = None

VERBOSE module-attribute

VERBOSE: bool = False

VLLM_ALL2ALL_BACKEND module-attribute

VLLM_ALL2ALL_BACKEND: str = 'naive'

VLLM_ALLOW_INSECURE_SERIALIZATION module-attribute

VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False

VLLM_ALLOW_LONG_MAX_MODEL_LEN module-attribute

VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False

VLLM_ALLOW_RUNTIME_LORA_UPDATING module-attribute

VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False

VLLM_API_KEY module-attribute

VLLM_API_KEY: Optional[str] = None

VLLM_ASSETS_CACHE module-attribute

VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, 'assets')

VLLM_ATTENTION_BACKEND module-attribute

VLLM_ATTENTION_BACKEND: Optional[str] = None

VLLM_AUDIO_FETCH_TIMEOUT module-attribute

VLLM_AUDIO_FETCH_TIMEOUT: int = 10

VLLM_CACHE_ROOT module-attribute

VLLM_CACHE_ROOT: str = os.path.expanduser('~/.cache/vllm')

VLLM_COMPUTE_NANS_IN_LOGITS module-attribute

VLLM_COMPUTE_NANS_IN_LOGITS: bool = False

VLLM_CONFIGURE_LOGGING module-attribute

VLLM_CONFIGURE_LOGGING: int = 1

VLLM_CONFIG_ROOT module-attribute

VLLM_CONFIG_ROOT: str = os.path.expanduser('~/.config/vllm')

VLLM_CPU_KVCACHE_SPACE module-attribute

VLLM_CPU_KVCACHE_SPACE: int = 0

VLLM_CPU_MOE_PREPACK module-attribute

VLLM_CPU_MOE_PREPACK: bool = True

VLLM_CPU_NUM_OF_RESERVED_CPU module-attribute

VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0

VLLM_CPU_OMP_THREADS_BIND module-attribute

VLLM_CPU_OMP_THREADS_BIND: str = ''

VLLM_CUDART_SO_PATH module-attribute

VLLM_CUDART_SO_PATH: Optional[str] = None

VLLM_DISABLED_KERNELS module-attribute

VLLM_DISABLED_KERNELS: list[str] = []

VLLM_DISABLE_COMPILE_CACHE module-attribute

VLLM_DISABLE_COMPILE_CACHE: bool = False

VLLM_DO_NOT_TRACK module-attribute

VLLM_DO_NOT_TRACK: bool = False

VLLM_DP_MASTER_IP module-attribute

VLLM_DP_MASTER_IP: str = ''

VLLM_DP_MASTER_PORT module-attribute

VLLM_DP_MASTER_PORT: int = 0

VLLM_DP_RANK module-attribute

VLLM_DP_RANK: int = 0

VLLM_DP_RANK_LOCAL module-attribute

VLLM_DP_RANK_LOCAL: int = -1

VLLM_DP_SIZE module-attribute

VLLM_DP_SIZE: int = 1

VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON module-attribute

VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False

VLLM_ENABLE_V1_MULTIPROCESSING module-attribute

VLLM_ENABLE_V1_MULTIPROCESSING: bool = True

VLLM_ENGINE_ITERATION_TIMEOUT_S module-attribute

VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60

VLLM_FLASHINFER_FORCE_TENSOR_CORES module-attribute

VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False

VLLM_FLASH_ATTN_VERSION module-attribute

VLLM_FLASH_ATTN_VERSION: Optional[int] = None

VLLM_FUSED_MOE_CHUNK_SIZE module-attribute

VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024

VLLM_HOST_IP module-attribute

VLLM_HOST_IP: str = ''

VLLM_HPU_USE_DELAYED_SAMPLING module-attribute

VLLM_HPU_USE_DELAYED_SAMPLING: bool = False

VLLM_HTTP_TIMEOUT_KEEP_ALIVE module-attribute

VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5

VLLM_IMAGE_FETCH_TIMEOUT module-attribute

VLLM_IMAGE_FETCH_TIMEOUT: int = 5

VLLM_KEEP_ALIVE_ON_ENGINE_DEATH module-attribute

VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False

VLLM_KV_CACHE_LAYOUT module-attribute

VLLM_KV_CACHE_LAYOUT: Optional[str] = None

VLLM_LOGGING_CONFIG_PATH module-attribute

VLLM_LOGGING_CONFIG_PATH: Optional[str] = None

VLLM_LOGGING_LEVEL module-attribute

VLLM_LOGGING_LEVEL: str = 'INFO'

VLLM_LOGGING_PREFIX module-attribute

VLLM_LOGGING_PREFIX: str = ''

VLLM_LOGITS_PROCESSOR_THREADS module-attribute

VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None

VLLM_LOG_BATCHSIZE_INTERVAL module-attribute

VLLM_LOG_BATCHSIZE_INTERVAL: float = -1

VLLM_LORA_RESOLVER_CACHE_DIR module-attribute

VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None

VLLM_MARLIN_USE_ATOMIC_ADD module-attribute

VLLM_MARLIN_USE_ATOMIC_ADD: bool = False

VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE module-attribute

VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840

VLLM_MLA_DISABLE module-attribute

VLLM_MLA_DISABLE: bool = False

VLLM_MM_INPUT_CACHE_GIB module-attribute

VLLM_MM_INPUT_CACHE_GIB: int = 8

VLLM_MODEL_REDIRECT_PATH module-attribute

VLLM_MODEL_REDIRECT_PATH: Optional[str] = None

VLLM_MOE_DP_CHUNK_SIZE module-attribute

VLLM_MOE_DP_CHUNK_SIZE: int = 256

VLLM_MQ_MAX_CHUNK_BYTES_MB module-attribute

VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16

VLLM_MSGPACK_ZERO_COPY_THRESHOLD module-attribute

VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256

VLLM_NCCL_SO_PATH module-attribute

VLLM_NCCL_SO_PATH: Optional[str] = None

VLLM_NIXL_SIDE_CHANNEL_HOST module-attribute

VLLM_NIXL_SIDE_CHANNEL_HOST: str = 'localhost'

VLLM_NIXL_SIDE_CHANNEL_PORT module-attribute

VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557

VLLM_NO_DEPRECATION_WARNING module-attribute

VLLM_NO_DEPRECATION_WARNING: bool = False

VLLM_NO_USAGE_STATS module-attribute

VLLM_NO_USAGE_STATS: bool = False

VLLM_PLUGINS module-attribute

VLLM_PLUGINS: Optional[list[str]] = None

VLLM_PORT module-attribute

VLLM_PORT: Optional[int] = None

VLLM_PP_LAYER_PARTITION module-attribute

VLLM_PP_LAYER_PARTITION: Optional[str] = None

VLLM_QUARK_EMU_MEM_OPT module-attribute

VLLM_QUARK_EMU_MEM_OPT: bool = False

VLLM_RANDOMIZE_DP_DUMMY_INPUTS module-attribute

VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False

VLLM_RAY_BUNDLE_INDICES module-attribute

VLLM_RAY_BUNDLE_INDICES: str = ''

VLLM_RAY_PER_WORKER_GPUS module-attribute

VLLM_RAY_PER_WORKER_GPUS: float = 1.0

VLLM_RINGBUFFER_WARNING_INTERVAL module-attribute

VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60

VLLM_ROCM_CUSTOM_PAGED_ATTN module-attribute

VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True

VLLM_ROCM_FP8_PADDING module-attribute

VLLM_ROCM_FP8_PADDING: bool = True

VLLM_ROCM_MOE_PADDING module-attribute

VLLM_ROCM_MOE_PADDING: bool = True

VLLM_ROCM_USE_AITER module-attribute

VLLM_ROCM_USE_AITER: bool = False

VLLM_ROCM_USE_AITER_LINEAR module-attribute

VLLM_ROCM_USE_AITER_LINEAR: bool = True

VLLM_ROCM_USE_AITER_MHA module-attribute

VLLM_ROCM_USE_AITER_MHA: bool = True

VLLM_ROCM_USE_AITER_MLA module-attribute

VLLM_ROCM_USE_AITER_MLA: bool = True

VLLM_ROCM_USE_AITER_MOE module-attribute

VLLM_ROCM_USE_AITER_MOE: bool = True

VLLM_ROCM_USE_AITER_PAGED_ATTN module-attribute

VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False

VLLM_ROCM_USE_AITER_RMSNORM module-attribute

VLLM_ROCM_USE_AITER_RMSNORM: bool = True

VLLM_ROCM_USE_SKINNY_GEMM module-attribute

VLLM_ROCM_USE_SKINNY_GEMM: bool = True

VLLM_RPC_BASE_PATH module-attribute

VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()

VLLM_RPC_TIMEOUT module-attribute

VLLM_RPC_TIMEOUT: int = 10000

VLLM_SERVER_DEV_MODE module-attribute

VLLM_SERVER_DEV_MODE: bool = False

VLLM_SKIP_P2P_CHECK module-attribute

VLLM_SKIP_P2P_CHECK: bool = False

VLLM_SLEEP_WHEN_IDLE module-attribute

VLLM_SLEEP_WHEN_IDLE: bool = False

VLLM_TARGET_DEVICE module-attribute

VLLM_TARGET_DEVICE: str = 'cuda'

VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL module-attribute

VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False

VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS module-attribute

VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1

VLLM_TORCH_PROFILER_DIR module-attribute

VLLM_TORCH_PROFILER_DIR: Optional[str] = None

VLLM_TPU_BUCKET_PADDING_GAP module-attribute

VLLM_TPU_BUCKET_PADDING_GAP: int = 0

VLLM_TRACE_FUNCTION module-attribute

VLLM_TRACE_FUNCTION: int = 0

VLLM_USAGE_SOURCE module-attribute

VLLM_USAGE_SOURCE: str = ''

VLLM_USAGE_STATS_SERVER module-attribute

VLLM_USAGE_STATS_SERVER: str = 'https://stats.vllm.ai'

VLLM_USE_DEEP_GEMM module-attribute

VLLM_USE_DEEP_GEMM: bool = False

VLLM_USE_FLASHINFER_SAMPLER module-attribute

VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None

VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH module-attribute

VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True

VLLM_USE_MODELSCOPE module-attribute

VLLM_USE_MODELSCOPE: bool = False

VLLM_USE_PRECOMPILED module-attribute

VLLM_USE_PRECOMPILED: bool = False

VLLM_USE_RAY_COMPILED_DAG module-attribute

VLLM_USE_RAY_COMPILED_DAG: bool = False

VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE module-attribute

VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = 'auto'

VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM module-attribute

VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False

VLLM_USE_RAY_SPMD_WORKER module-attribute

VLLM_USE_RAY_SPMD_WORKER: bool = False

VLLM_USE_TRITON_AWQ module-attribute

VLLM_USE_TRITON_AWQ: bool = False

VLLM_USE_TRITON_FLASH_ATTN module-attribute

VLLM_USE_TRITON_FLASH_ATTN: bool = True

VLLM_USE_V1 module-attribute

VLLM_USE_V1: bool = True

VLLM_V0_USE_OUTLINES_CACHE module-attribute

VLLM_V0_USE_OUTLINES_CACHE: bool = False

VLLM_V1_OUTPUT_PROC_CHUNK_SIZE module-attribute

VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128

VLLM_V1_USE_PREFILL_DECODE_ATTENTION module-attribute

VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False

VLLM_VIDEO_FETCH_TIMEOUT module-attribute

VLLM_VIDEO_FETCH_TIMEOUT: int = 30

VLLM_VIDEO_LOADER_BACKEND module-attribute

VLLM_VIDEO_LOADER_BACKEND: str = 'opencv'

VLLM_WORKER_MULTIPROC_METHOD module-attribute

VLLM_WORKER_MULTIPROC_METHOD: str = 'fork'

VLLM_XGRAMMAR_CACHE_MB module-attribute

VLLM_XGRAMMAR_CACHE_MB: int = 0

VLLM_XLA_CACHE_PATH module-attribute

VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")

VLLM_XLA_CHECK_RECOMPILATION module-attribute

VLLM_XLA_CHECK_RECOMPILATION: bool = False

VLLM_XLA_USE_SPMD module-attribute

VLLM_XLA_USE_SPMD: bool = False

V_SCALE_CONSTANT module-attribute

V_SCALE_CONSTANT: int = 100

environment_variables module-attribute

environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
    "NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None),
    "VLLM_USE_PRECOMPILED": lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED"))
    or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
    "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
        int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0"))
    ),
    "CMAKE_BUILD_TYPE": lambda: os.getenv("CMAKE_BUILD_TYPE"),
    "VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))),
    "VLLM_CONFIG_ROOT": lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CONFIG_ROOT",
            os.path.join(get_default_config_root(), "vllm"),
        )
    ),
    "VLLM_CACHE_ROOT": lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CACHE_ROOT",
            os.path.join(get_default_cache_root(), "vllm"),
        )
    ),
    "VLLM_HOST_IP": lambda: os.getenv("VLLM_HOST_IP", ""),
    "VLLM_PORT": get_vllm_port,
    "VLLM_RPC_BASE_PATH": lambda: os.getenv(
        "VLLM_RPC_BASE_PATH", tempfile.gettempdir()
    ),
    "VLLM_USE_MODELSCOPE": lambda: os.environ.get(
        "VLLM_USE_MODELSCOPE", "False"
    ).lower() == "true",
    "VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
        os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
    ),
    "CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None),
    "VLLM_NCCL_SO_PATH": lambda: os.environ.get("VLLM_NCCL_SO_PATH", None),
    "LD_LIBRARY_PATH": lambda: os.environ.get("LD_LIBRARY_PATH", None),
    "VLLM_USE_TRITON_FLASH_ATTN": lambda: os.environ.get(
        "VLLM_USE_TRITON_FLASH_ATTN", "True"
    ).lower() in ("true", "1"),
    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: os.getenv(
        "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False"
    ).lower() in ("true", "1"),
    "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
        os.environ.get("VLLM_FLASH_ATTN_VERSION", None)
    ),
    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool(
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"
    ),
    "VLLM_USE_STANDALONE_COMPILE": lambda: os.environ.get(
        "VLLM_USE_STANDALONE_COMPILE", "1"
    ) == "1",
    "LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")),
    "CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
    "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
        os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
    ),
    "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None),
    "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: os.environ.get(
        "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"
    ).lower() == "true",
    "S3_ACCESS_KEY_ID": lambda: os.environ.get("S3_ACCESS_KEY_ID", None),
    "S3_SECRET_ACCESS_KEY": lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None),
    "S3_ENDPOINT_URL": lambda: os.environ.get("S3_ENDPOINT_URL", None),
    "VLLM_USAGE_STATS_SERVER": lambda: os.environ.get(
        "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
    ),
    "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
    "VLLM_DO_NOT_TRACK": lambda: (
        os.environ.get("VLLM_DO_NOT_TRACK", None)
        or os.environ.get("DO_NOT_TRACK", None)
        or "0"
    ) == "1",
    "VLLM_USAGE_SOURCE": lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"),
    "VLLM_CONFIGURE_LOGGING": lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")),
    "VLLM_LOGGING_CONFIG_PATH": lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"),
    "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(),
    "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
    "VLLM_LOGITS_PROCESSOR_THREADS": lambda: int(
        os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")
    )
    if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ
    else None,
    "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")),
    "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
    "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
        int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"])
    )
    if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ
    else None,
    "VLLM_FLASHINFER_FORCE_TENSOR_CORES": lambda: bool(
        int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))
    ),
    "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None),
    "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
    "VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv(
        "VLLM_CPU_OMP_THREADS_BIND", "auto"
    ),
    "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
        os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
    ),
    "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
    "VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
        int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
    ),
    "VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
        int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
    ),
    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": lambda: os.getenv(
        "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"
    ),
    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
        int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
    ),
    "VLLM_WORKER_MULTIPROC_METHOD": lambda: os.getenv(
        "VLLM_WORKER_MULTIPROC_METHOD", "fork"
    ),
    "VLLM_ASSETS_CACHE": lambda: os.path.expanduser(
        os.getenv(
            "VLLM_ASSETS_CACHE",
            os.path.join(get_default_cache_root(), "vllm", "assets"),
        )
    ),
    "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
        os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
    ),
    "VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
        os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
    ),
    "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
        os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
    ),
    "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv(
        "VLLM_VIDEO_LOADER_BACKEND", "opencv"
    ),
    "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),
    "VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser(
        os.getenv(
            "VLLM_XLA_CACHE_PATH",
            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
        )
    ),
    "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
        int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
    ),
    "VLLM_XLA_USE_SPMD": lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))),
    "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
        os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
    ),
    "VLLM_NO_DEPRECATION_WARNING": lambda: bool(
        int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))
    ),
    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
        int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))
    ),
    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: os.environ.get(
        "VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0"
    ).strip().lower() in ("1", "true"),
    "VLLM_TEST_FORCE_FP8_MARLIN": lambda: os.environ.get(
        "VLLM_TEST_FORCE_FP8_MARLIN", "0"
    ).strip().lower() in ("1", "true"),
    "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv(
        "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
    ),
    "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),
    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
        os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
    ),
    "VLLM_PLUGINS": lambda: None
    if "VLLM_PLUGINS" not in os.environ
    else os.environ["VLLM_PLUGINS"].split(","),
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
    "VLLM_TORCH_PROFILER_DIR": lambda: None
    if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None
    else os.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", ".")),
    "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
    "VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: os.environ.get(
        "VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0"
    ).strip().lower() in ("1", "true"),
    "VLLM_SKIP_P2P_CHECK": lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",
    "VLLM_DISABLED_KERNELS": lambda: []
    if "VLLM_DISABLED_KERNELS" not in os.environ
    else os.environ["VLLM_DISABLED_KERNELS"].split(","),
    "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
    "VLLM_ROCM_USE_AITER": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER", "False"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER_PAGED_ATTN", "False"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_LINEAR": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER_LINEAR", "True"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MOE": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER_MOE", "True"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_RMSNORM": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER_RMSNORM", "True"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MLA": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER_MLA", "True"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_AITER_MHA": lambda: os.getenv(
        "VLLM_ROCM_USE_AITER_MHA", "True"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_USE_SKINNY_GEMM": lambda: os.getenv(
        "VLLM_ROCM_USE_SKINNY_GEMM", "True"
    ).lower() in ("true", "1"),
    "VLLM_ROCM_FP8_PADDING": lambda: bool(
        int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))
    ),
    "VLLM_ROCM_MOE_PADDING": lambda: bool(
        int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))
    ),
    "VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: os.getenv(
        "VLLM_ROCM_CUSTOM_PAGED_ATTN", "True"
    ).lower() in ("true", "1"),
    "VLLM_QUARK_EMU_MEM_OPT": lambda: bool(
        int(os.getenv("VLLM_QUARK_EMU_MEM_OPT", "0"))
    ),
    "Q_SCALE_CONSTANT": lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")),
    "K_SCALE_CONSTANT": lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
    "V_SCALE_CONSTANT": lambda: int(os.getenv("V_SCALE_CONSTANT", "100")),
    "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
        int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
    ),
    "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
        os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
    ),
    "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
        int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
    ),
    "VLLM_SERVER_DEV_MODE": lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))),
    "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
        os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
    ),
    "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": lambda: bool(
        int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
    ),
    "VLLM_RAY_PER_WORKER_GPUS": lambda: float(
        os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
    ),
    "VLLM_RAY_BUNDLE_INDICES": lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""),
    "VLLM_CUDART_SO_PATH": lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
    "VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH": lambda: os.environ.get(
        "VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH", "true"
    ).lower() in ("1", "true"),
    "VLLM_HPU_USE_DELAYED_SAMPLING": lambda: os.environ.get(
        "VLLM_HPU_USE_DELAYED_SAMPLING", "false"
    ).lower() in ("1", "true"),
    "VLLM_DP_RANK": lambda: int(os.getenv("VLLM_DP_RANK", "0")),
    "VLLM_DP_RANK_LOCAL": lambda: int(
        os.getenv("VLLM_DP_RANK_LOCAL", os.getenv("VLLM_DP_RANK", "0"))
    ),
    "VLLM_DP_SIZE": lambda: int(os.getenv("VLLM_DP_SIZE", "1")),
    "VLLM_DP_MASTER_IP": lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"),
    "VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
    "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")),
    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get(
        "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
    ) == "1",
    "VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
    "VLLM_MODEL_REDIRECT_PATH": lambda: os.environ.get(
        "VLLM_MODEL_REDIRECT_PATH", None
    ),
    "VLLM_MARLIN_USE_ATOMIC_ADD": lambda: os.environ.get(
        "VLLM_MARLIN_USE_ATOMIC_ADD", "0"
    ) == "1",
    "VLLM_V0_USE_OUTLINES_CACHE": lambda: os.environ.get(
        "VLLM_V0_USE_OUTLINES_CACHE", "0"
    ) == "1",
    "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
        os.environ["VLLM_TPU_BUCKET_PADDING_GAP"]
    )
    if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ
    else 0,
    "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
    "VLLM_XGRAMMAR_CACHE_MB": lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
        os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
    ),
    "VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
        int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0"))
    ),
    "VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: os.getenv(
        "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
    ),
    "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
        os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")
    ),
    "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
        os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")
    ),
    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
        os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
    ),
    "VLLM_SLEEP_WHEN_IDLE": lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))),
    "VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
        os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
    ),
    "VLLM_KV_CACHE_LAYOUT": lambda: os.getenv("VLLM_KV_CACHE_LAYOUT", None),
    "VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
        int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
    ),
}
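
Each value in the table is a zero-argument callable, so a variable is parsed from the process environment at attribute-access time rather than once at import. A minimal usage sketch (hypothetical session; the expected values follow from the defaults in the table above):

import os

import vllm.envs as envs

os.environ["VLLM_RPC_TIMEOUT"] = "20000"
assert envs.VLLM_RPC_TIMEOUT == 20000  # parsed with int(...) by the table
assert envs.VLLM_USE_V1 is True        # unset, so default "1" -> bool(int("1"))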

__dir__

__dir__()
Source code in vllm/envs.py
def __dir__():
    return list(environment_variables.keys())
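
Since __dir__ returns the table's keys, dir() on the module enumerates exactly the supported variable names:

import vllm.envs as envs

assert "VLLM_PORT" in dir(envs)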

__getattr__

__getattr__(name: str)
Source code in vllm/envs.py
def __getattr__(name: str):
    # lazy evaluation of environment variables
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

compute_hash

compute_hash() -> str

WARNING: Whenever a new key is added to these environment variables, ensure that it is included in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION will generate different computation graphs, so it is included in the factors list. Env vars that affect the choice of kernels or attention backends should also be included in the factors list.

Source code in vllm/envs.py
def compute_hash() -> str:
    """
    WARNING: Whenever a new key is added to these environment
    variables, ensure that it is included in the factors list if
    it affects the computation graph. For example, different values
    of VLLM_PP_LAYER_PARTITION will generate different computation
    graphs, so it is included in the factors list. Env vars that
    affect the choice of kernels or attention backends should also
    be included in the factors list.
    """
    factors: list[Any] = []

    # summarize environment variables
    def factorize(name: str):
        if __getattr__(name):
            factors.append(__getattr__(name))
        else:
            factors.append("None")

    # The values of envs may affect the computation graph.
    # TODO(DefTruth): hash all environment variables?
    # for key in environment_variables:
    #     factorize(key)
    environment_variables_to_hash = [
        "VLLM_PP_LAYER_PARTITION",
        "VLLM_MLA_DISABLE",
        "VLLM_USE_TRITON_FLASH_ATTN",
        "VLLM_USE_TRITON_AWQ",
        "VLLM_DP_RANK",
        "VLLM_DP_SIZE",
        "VLLM_USE_STANDALONE_COMPILE",
    ]
    for key in environment_variables_to_hash:
        if key in environment_variables:
            factorize(key)

    hash_str = hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()

    return hash_str
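
Because each factor is re-read from the environment on every call, the hash tracks the current process environment. A small sanity check (hypothetical session; assumes vllm is importable):

import os

import vllm.envs as envs

h1 = envs.compute_hash()
os.environ["VLLM_MLA_DISABLE"] = "1"  # one of the hashed factors above
h2 = envs.compute_hash()
assert h1 != h2  # cache keys derived from the hash now differ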

get_default_cache_root

get_default_cache_root()
Source code in vllm/envs.py
def get_default_cache_root():
    return os.getenv(
        "XDG_CACHE_HOME",
        os.path.join(os.path.expanduser("~"), ".cache"),
    )

get_default_config_root

get_default_config_root()
Source code in vllm/envs.py
def get_default_config_root():
    return os.getenv(
        "XDG_CONFIG_HOME",
        os.path.join(os.path.expanduser("~"), ".config"),
    )
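
Both helpers honor the XDG base-directory variables and fall back to the conventional home-directory locations. For example (hypothetical values):

import os

os.environ["XDG_CACHE_HOME"] = "/tmp/xdg-cache"
assert get_default_cache_root() == "/tmp/xdg-cache"

del os.environ["XDG_CACHE_HOME"]
assert get_default_cache_root() == os.path.join(os.path.expanduser("~"), ".cache")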

get_vllm_port

get_vllm_port() -> Optional[int]

Get the port from VLLM_PORT environment variable.

Returns:

Optional[int]: The port number as an integer if VLLM_PORT is set, None otherwise.

Raises:

ValueError: If VLLM_PORT is set to a URI, which suggests a Kubernetes service discovery misconfiguration.

Source code in vllm/envs.py
def get_vllm_port() -> Optional[int]:
    """Get the port from VLLM_PORT environment variable.

    Returns:
        The port number as an integer if VLLM_PORT is set, None otherwise.

    Raises:
        ValueError: If VLLM_PORT is a URI, which suggests a k8s service discovery issue.
    """
    if 'VLLM_PORT' not in os.environ:
        return None

    port = os.getenv('VLLM_PORT', '0')

    try:
        return int(port)
    except ValueError as err:
        from urllib.parse import urlparse
        parsed = urlparse(port)
        if parsed.scheme:
            raise ValueError(
                f"VLLM_PORT '{port}' appears to be a URI. "
                "This may be caused by a Kubernetes service discovery issue,"
                "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
            ) from None
        raise ValueError(
            f"VLLM_PORT '{port}' must be a valid integer") from err

is_set

is_set(name: str)

Check if an environment variable is explicitly set.

Source code in vllm/envs.py
def is_set(name: str):
    """Check if an environment variable is explicitly set."""
    if name in environment_variables:
        return name in os.environ
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

maybe_convert_int

maybe_convert_int(value: Optional[str]) -> Optional[int]
Source code in vllm/envs.py
def maybe_convert_int(value: Optional[str]) -> Optional[int]:
    if value is None:
        return None
    return int(value)
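
It threads None through int(): maybe_convert_int(None) is None, while maybe_convert_int("3") == 3. A non-numeric string still raises ValueError.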

set_vllm_use_v1

set_vllm_use_v1(use_v1: bool)
Source code in vllm/envs.py
def set_vllm_use_v1(use_v1: bool):
    if is_set("VLLM_USE_V1"):
        raise ValueError(
            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
            "explicitly by the user. Please raise this as a Github "
            "Issue and explicitly set VLLM_USE_V1=0 or 1.")
    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"