vllm.envs
VLLM_ALLOW_INSECURE_SERIALIZATION
module-attribute
¶
VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON
module-attribute
¶
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_FLASHINFER_FORCE_TENSOR_CORES
module-attribute
¶
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
VLLM_LOGITS_PROCESSOR_THREADS
module-attribute
¶
VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
module-attribute
¶
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
module-attribute
¶
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
module-attribute
¶
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH
module-attribute
¶
VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
module-attribute
¶
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = 'auto'
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM
module-attribute
¶
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_V1_USE_PREFILL_DECODE_ATTENTION
module-attribute
¶
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_XLA_CACHE_PATH
module-attribute
¶
VLLM_XLA_CACHE_PATH: str = join(
VLLM_CACHE_ROOT, "xla_cache"
)
environment_variables
module-attribute
¶
environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TARGET_DEVICE": lambda: getenv(
"VLLM_TARGET_DEVICE", "cuda"
),
"MAX_JOBS": lambda: getenv("MAX_JOBS", None),
"NVCC_THREADS": lambda: getenv("NVCC_THREADS", None),
"VLLM_USE_PRECOMPILED": lambda: bool(
get("VLLM_USE_PRECOMPILED")
)
or bool(get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool(
int(
getenv(
"VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL",
"0",
)
)
),
"CMAKE_BUILD_TYPE": lambda: getenv("CMAKE_BUILD_TYPE"),
"VERBOSE": lambda: bool(int(getenv("VERBOSE", "0"))),
"VLLM_CONFIG_ROOT": lambda: expanduser(
getenv(
"VLLM_CONFIG_ROOT",
join(get_default_config_root(), "vllm"),
)
),
"VLLM_CACHE_ROOT": lambda: expanduser(
getenv(
"VLLM_CACHE_ROOT",
join(get_default_cache_root(), "vllm"),
)
),
"VLLM_HOST_IP": lambda: getenv("VLLM_HOST_IP", ""),
"VLLM_PORT": get_vllm_port,
"VLLM_RPC_BASE_PATH": lambda: getenv(
"VLLM_RPC_BASE_PATH", gettempdir()
),
"VLLM_USE_MODELSCOPE": lambda: lower() == "true",
"VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(
get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")
),
"CUDA_HOME": lambda: get("CUDA_HOME", None),
"VLLM_NCCL_SO_PATH": lambda: get(
"VLLM_NCCL_SO_PATH", None
),
"LD_LIBRARY_PATH": lambda: get("LD_LIBRARY_PATH", None),
"VLLM_USE_TRITON_FLASH_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: lower()
in ("true", "1"),
"VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(
get("VLLM_FLASH_ATTN_VERSION", None)
),
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool(
get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
!= "0"
),
"VLLM_USE_STANDALONE_COMPILE": lambda: get(
"VLLM_USE_STANDALONE_COMPILE", "1"
)
== "1",
"LOCAL_RANK": lambda: int(get("LOCAL_RANK", "0")),
"CUDA_VISIBLE_DEVICES": lambda: get(
"CUDA_VISIBLE_DEVICES", None
),
"VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
),
"VLLM_API_KEY": lambda: get("VLLM_API_KEY", None),
"VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: lower()
== "true",
"S3_ACCESS_KEY_ID": lambda: get(
"S3_ACCESS_KEY_ID", None
),
"S3_SECRET_ACCESS_KEY": lambda: get(
"S3_SECRET_ACCESS_KEY", None
),
"S3_ENDPOINT_URL": lambda: get("S3_ENDPOINT_URL", None),
"VLLM_USAGE_STATS_SERVER": lambda: get(
"VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"
),
"VLLM_NO_USAGE_STATS": lambda: get(
"VLLM_NO_USAGE_STATS", "0"
)
== "1",
"VLLM_DO_NOT_TRACK": lambda: (
get("VLLM_DO_NOT_TRACK", None)
or get("DO_NOT_TRACK", None)
or "0"
)
== "1",
"VLLM_USAGE_SOURCE": lambda: get(
"VLLM_USAGE_SOURCE", "production"
),
"VLLM_CONFIGURE_LOGGING": lambda: int(
getenv("VLLM_CONFIGURE_LOGGING", "1")
),
"VLLM_LOGGING_CONFIG_PATH": lambda: getenv(
"VLLM_LOGGING_CONFIG_PATH"
),
"VLLM_LOGGING_LEVEL": lambda: upper(),
"VLLM_LOGGING_PREFIX": lambda: getenv(
"VLLM_LOGGING_PREFIX", ""
),
"VLLM_LOGITS_PROCESSOR_THREADS": lambda: int(
getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")
)
if "VLLM_LOGITS_PROCESSOR_THREADS" in environ
else None,
"VLLM_TRACE_FUNCTION": lambda: int(
getenv("VLLM_TRACE_FUNCTION", "0")
),
"VLLM_ATTENTION_BACKEND": lambda: getenv(
"VLLM_ATTENTION_BACKEND", None
),
"VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(
int(environ["VLLM_USE_FLASHINFER_SAMPLER"])
)
if "VLLM_USE_FLASHINFER_SAMPLER" in environ
else None,
"VLLM_FLASHINFER_FORCE_TENSOR_CORES": lambda: bool(
int(
getenv(
"VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"
)
)
),
"VLLM_PP_LAYER_PARTITION": lambda: getenv(
"VLLM_PP_LAYER_PARTITION", None
),
"VLLM_CPU_KVCACHE_SPACE": lambda: int(
getenv("VLLM_CPU_KVCACHE_SPACE", "0")
),
"VLLM_CPU_OMP_THREADS_BIND": lambda: getenv(
"VLLM_CPU_OMP_THREADS_BIND", "auto"
),
"VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int(
getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")
),
"VLLM_CPU_MOE_PREPACK": lambda: bool(
int(getenv("VLLM_CPU_MOE_PREPACK", "1"))
),
"VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
int(getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
int(getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
),
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": lambda: getenv(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto"
),
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
int(
getenv(
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM",
"0",
)
)
),
"VLLM_WORKER_MULTIPROC_METHOD": lambda: getenv(
"VLLM_WORKER_MULTIPROC_METHOD", "fork"
),
"VLLM_ASSETS_CACHE": lambda: expanduser(
getenv(
"VLLM_ASSETS_CACHE",
join(
get_default_cache_root(), "vllm", "assets"
),
)
),
"VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")
),
"VLLM_VIDEO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")
),
"VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
),
"VLLM_VIDEO_LOADER_BACKEND": lambda: getenv(
"VLLM_VIDEO_LOADER_BACKEND", "opencv"
),
"VLLM_MM_INPUT_CACHE_GIB": lambda: int(
getenv("VLLM_MM_INPUT_CACHE_GIB", "4")
),
"VLLM_XLA_CACHE_PATH": lambda: expanduser(
getenv(
"VLLM_XLA_CACHE_PATH",
join(
get_default_cache_root(),
"vllm",
"xla_cache",
),
)
),
"VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(
int(getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))
),
"VLLM_XLA_USE_SPMD": lambda: bool(
int(getenv("VLLM_XLA_USE_SPMD", "0"))
),
"VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(
getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")
),
"VLLM_NO_DEPRECATION_WARNING": lambda: bool(
int(getenv("VLLM_NO_DEPRECATION_WARNING", "0"))
),
"VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
),
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_FP8_MARLIN": lambda: lower()
in ("1", "true"),
"VLLM_TEST_FORCE_LOAD_FORMAT": lambda: getenv(
"VLLM_TEST_FORCE_LOAD_FORMAT", "dummy"
),
"VLLM_RPC_TIMEOUT": lambda: int(
getenv("VLLM_RPC_TIMEOUT", "10000")
),
"VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int(
get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")
),
"VLLM_PLUGINS": lambda: None
if "VLLM_PLUGINS" not in environ
else split(","),
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: getenv(
"VLLM_LORA_RESOLVER_CACHE_DIR", None
),
"VLLM_TORCH_PROFILER_DIR": lambda: None
if getenv("VLLM_TORCH_PROFILER_DIR", None) is None
else expanduser(getenv("VLLM_TORCH_PROFILER_DIR", ".")),
"VLLM_USE_TRITON_AWQ": lambda: bool(
int(getenv("VLLM_USE_TRITON_AWQ", "0"))
),
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: lower()
in ("1", "true"),
"VLLM_SKIP_P2P_CHECK": lambda: getenv(
"VLLM_SKIP_P2P_CHECK", "0"
)
== "1",
"VLLM_DISABLED_KERNELS": lambda: []
if "VLLM_DISABLED_KERNELS" not in environ
else split(","),
"VLLM_USE_V1": lambda: bool(
int(getenv("VLLM_USE_V1", "1"))
),
"VLLM_ROCM_USE_AITER": lambda: lower() in ("true", "1"),
"VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_LINEAR": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MOE": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_RMSNORM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MLA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_AITER_MHA": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_USE_SKINNY_GEMM": lambda: lower()
in ("true", "1"),
"VLLM_ROCM_FP8_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_FP8_PADDING", "1"))
),
"VLLM_ROCM_MOE_PADDING": lambda: bool(
int(getenv("VLLM_ROCM_MOE_PADDING", "1"))
),
"VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: lower()
in ("true", "1"),
"VLLM_QUARK_EMU_MEM_OPT": lambda: bool(
int(getenv("VLLM_QUARK_EMU_MEM_OPT", "0"))
),
"Q_SCALE_CONSTANT": lambda: int(
getenv("Q_SCALE_CONSTANT", "200")
),
"K_SCALE_CONSTANT": lambda: int(
getenv("K_SCALE_CONSTANT", "200")
),
"V_SCALE_CONSTANT": lambda: int(
getenv("V_SCALE_CONSTANT", "100")
),
"VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(
int(getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))
),
"VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
),
"VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
int(getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
),
"VLLM_SERVER_DEV_MODE": lambda: bool(
int(getenv("VLLM_SERVER_DEV_MODE", "0"))
),
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int(
getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")
),
"VLLM_MLA_DISABLE": lambda: bool(
int(getenv("VLLM_MLA_DISABLE", "0"))
),
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": lambda: bool(
int(
getenv(
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON",
"0",
)
)
),
"VLLM_RAY_PER_WORKER_GPUS": lambda: float(
getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")
),
"VLLM_RAY_BUNDLE_INDICES": lambda: getenv(
"VLLM_RAY_BUNDLE_INDICES", ""
),
"VLLM_CUDART_SO_PATH": lambda: getenv(
"VLLM_CUDART_SO_PATH", None
),
"VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH": lambda: lower()
in ("1", "true"),
"VLLM_HPU_USE_DELAYED_SAMPLING": lambda: lower()
in ("1", "true"),
"VLLM_DP_RANK": lambda: int(
getenv("VLLM_DP_RANK", "0")
),
"VLLM_DP_RANK_LOCAL": lambda: int(
getenv("VLLM_DP_RANK_LOCAL", VLLM_DP_RANK)
),
"VLLM_DP_SIZE": lambda: int(
getenv("VLLM_DP_SIZE", "1")
),
"VLLM_DP_MASTER_IP": lambda: getenv(
"VLLM_DP_MASTER_IP", "127.0.0.1"
),
"VLLM_DP_MASTER_PORT": lambda: int(
getenv("VLLM_DP_MASTER_PORT", "0")
),
"VLLM_MOE_DP_CHUNK_SIZE": lambda: int(
getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")
),
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: get(
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
)
== "1",
"VLLM_CI_USE_S3": lambda: get("VLLM_CI_USE_S3", "0")
== "1",
"VLLM_MODEL_REDIRECT_PATH": lambda: get(
"VLLM_MODEL_REDIRECT_PATH", None
),
"VLLM_MARLIN_USE_ATOMIC_ADD": lambda: get(
"VLLM_MARLIN_USE_ATOMIC_ADD", "0"
)
== "1",
"VLLM_V0_USE_OUTLINES_CACHE": lambda: get(
"VLLM_V0_USE_OUTLINES_CACHE", "0"
)
== "1",
"VLLM_TPU_BUCKET_PADDING_GAP": lambda: int(
environ["VLLM_TPU_BUCKET_PADDING_GAP"]
)
if "VLLM_TPU_BUCKET_PADDING_GAP" in environ
else 0,
"VLLM_USE_DEEP_GEMM": lambda: bool(
int(getenv("VLLM_USE_DEEP_GEMM", "0"))
),
"VLLM_XGRAMMAR_CACHE_MB": lambda: int(
getenv("VLLM_XGRAMMAR_CACHE_MB", "512")
),
"VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(
getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")
),
"VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool(
int(
getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")
)
),
"VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: getenv(
"VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost"
),
"VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int(
getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")
),
"VLLM_ALL2ALL_BACKEND": lambda: getenv(
"VLLM_ALL2ALL_BACKEND", "naive"
),
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(
getenv(
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840"
)
),
"VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int(
getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")
),
"VLLM_SLEEP_WHEN_IDLE": lambda: bool(
int(getenv("VLLM_SLEEP_WHEN_IDLE", "0"))
),
"VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int(
getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")
),
"VLLM_KV_CACHE_LAYOUT": lambda: getenv(
"VLLM_KV_CACHE_LAYOUT", None
),
"VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool(
int(getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))
),
}
__dir__
¶
compute_hash
¶
compute_hash() -> str
WARNING: Whenever a new key is added to these environment variables, ensure that it is included in the factors list if it affects the computation graph. For example, different values of VLLM_PP_LAYER_PARTITION will generate different computation graphs, so it is included in the factors list. Environment variables that affect the choice of kernels or attention backends should also be included in the factors list.
Source code in vllm/envs.py
get_default_cache_root
¶
get_default_config_root
¶
get_vllm_port
¶
Get the port from VLLM_PORT environment variable.
Returns:
| Type | Description |
|---|---|
| Optional[int] | The port number as an integer if VLLM_PORT is set, None otherwise. |
Raises:
| Type | Description |
|---|---|
| ValueError | If VLLM_PORT is a URI rather than a port number, which suggests a Kubernetes (k8s) service discovery issue. |