Environment Variables¶
SenDNN Inference uses the following environment variables to configure the system:
environment_variables: dict[str, Callable[[], Any]] = {
# Defines the prompt lengths the Spyre accelerator should be prepared
# for, formatted as comma separated list. Only applicable in pooling.
"SENDNN_INFERENCE_WARMUP_PROMPT_LENS": lambda: [
int(p)
for p in os.getenv(key="SENDNN_INFERENCE_WARMUP_PROMPT_LENS", default="64").split(",")
],
# Defines the batch sizes the Spyre accelerator should be prepared
# for, formatted as comma separated list. Only applicable in pooling.
"SENDNN_INFERENCE_WARMUP_BATCH_SIZES": lambda: [
int(b) for b in os.getenv(key="SENDNN_INFERENCE_WARMUP_BATCH_SIZES", default="1").split(",")
],
# Defines the backend that torch.compile will use when using Spyre
# Available options:
# - "sendnn": Compile for execution on Spyre hardware
# - "inductor": Compile for execution on CPU (for debug and testing)
# - "eager": Skip compile entirely (for debug and testing)
#
"SENDNN_INFERENCE_DYNAMO_BACKEND": lambda: os.getenv(
"SENDNN_INFERENCE_DYNAMO_BACKEND", "sendnn"
),
# Enable performance metric logging. This captures startup information
# such as warmup times, and loading times.
# When `--disable-log-stats=False` is used, this will log timing metrics
# about every finished request into a .jsonl file. These are the same
# metrics that are available in prometheus format on the /metrics endpoint,
# but it is sometime helpful to view them disaggregated to debug performance
# problems. This logging is not designed to be performant, and should not be
# enabled in production settings.
# It is turned off by default.
"SENDNN_INFERENCE_PERF_METRIC_LOGGING_ENABLED": lambda: int(
os.getenv("SENDNN_INFERENCE_PERF_METRIC_LOGGING_ENABLED", 0)
),
# Directory to write performance metric logging files. By default,
# logs are written to /tmp.
"SENDNN_INFERENCE_PERF_METRIC_LOGGING_DIR": lambda: os.getenv(
"SENDNN_INFERENCE_PERF_METRIC_LOGGING_DIR", "/tmp"
),
# If set, override the signal handler for sendnn-inference on
# vLLM V1 + torch_sendnn backend to be able to gracefully
# shutdown the engine.
"SENDNN_INFERENCE_OVERRIDE_SIGNALS_HANDLER": lambda: bool(
int(os.getenv("SENDNN_INFERENCE_OVERRIDE_SIGNALS_HANDLER", "1"))
),
# Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
# based on the detected CPU cores and server configuration
"SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
),
# If set, limit the number of concurrent processes loading/compiling
# large models or models with larger context lengths to limit
# memory usage.
# Set to 0 to allow any number of processes
"SENDNN_INFERENCE_MAX_LOAD_PROCESSES": lambda: int(
os.getenv("SENDNN_INFERENCE_MAX_LOAD_PROCESSES", "0")
),
# If set, redirects all stdout and stderr from worker processes to files
# within this director. This is useful for debugging card-specific errors
# in multi-AIU setups, but should never be enabled in production settings.
# This removes all output from stdout and stderr for the worker processes.
"SENDNN_INFERENCE_WORKER_LOG_REDIRECT_DIR": lambda: os.getenv(
"SENDNN_INFERENCE_WORKER_LOG_REDIRECT_DIR", ""
),
# If set, overrides the default (30 minutes) timeout for
# torch.distributed.init_process_group
"SENDNN_INFERENCE_GLOO_TIMEOUT_MINUTES": lambda: int(
os.getenv("SENDNN_INFERENCE_GLOO_TIMEOUT_MINUTES", "60")
),
# If set, this will require use of pre-compiled models and
# disable compilation for decoders
"SENDNN_INFERENCE_REQUIRE_PRECOMPILED_DECODERS": lambda: bool(
int(os.getenv("SENDNN_INFERENCE_REQUIRE_PRECOMPILED_DECODERS", "0"))
),
# Simple compile backend for some dynamically compiled operations, like
# gathering logprobs in the sampler.
# Defaults to eager, iductor can be used if python headers and a compiler
# are available.
"SENDNN_INFERENCE_SIMPLE_COMPILE_BACKEND": lambda: os.getenv(
"SENDNN_INFERENCE_SIMPLE_COMPILE_BACKEND", "inductor"
),
# Configures the number of CPUs used when determining multi-threading
# configurations
# Set to 0 to have sendnn-inference attempt to detect the CPU count
"SENDNN_INFERENCE_NUM_CPUS": lambda: int(os.getenv("SENDNN_INFERENCE_NUM_CPUS", "0")),
# Feature Flag
# Works only with chunked prefill enabled. If set, prefill steps are
# interleaved with a decode step
"SENDNN_INFERENCE_CP_INTERLEAVE_STEPS": lambda: bool(
int(os.getenv("SENDNN_INFERENCE_CP_INTERLEAVE_STEPS", "1"))
),
# If set, raises a runtime error if the model configuration is not found
# in the known configurations registry. Only applies when running on
# Spyre device (sendnn backend).
"SENDNN_INFERENCE_REQUIRE_KNOWN_CONFIG": lambda: bool(
int(os.getenv("SENDNN_INFERENCE_REQUIRE_KNOWN_CONFIG", "0"))
),
# Path to custom model_configs.yaml file. If not set, uses the default
# location at sendnn_inference/config/model_configs.yaml
"SENDNN_INFERENCE_MODEL_CONFIG_FILE": lambda: os.getenv("SENDNN_INFERENCE_MODEL_CONFIG_FILE"),
# Dtype for multimodal vision_tower / multi_modal_projector params (CPU).
# One of "float32" | "float16" | "bfloat16"; default per platform.
"SENDNN_INFERENCE_CPU_MM_DTYPE": lambda: parse_cpu_mm_dtype(
os.getenv(
"SENDNN_INFERENCE_CPU_MM_DTYPE",
_CPU_MM_DTYPE_PLATFORM_DEFAULTS.get(platform.machine(), "float16"),
)
),
# Device used to execute the multimodal vision_tower / multi_modal_projector.
# "auto" (default) routes the encoder to the Telum through the torch_nnpa
# privateuse1 backend when torch_nnpa is importable, and falls back silently
# to CPU only when torch_nnpa is *absent*. "cpu" forces CPU execution. "nnpa"
# additionally raises ImportError at startup if torch_nnpa is missing. This
# setting only resolves intent; torch_nnpa is not imported here. The
# privateuse1 backend is registered lazily -- and only for multimodal models
# -- at vision-weight placement via utils.ensure_nnpa_registered(), which
# also registers the PrivateUse1HooksInterface that PyTorch autoload would
# normally provide (this plugin sets TORCH_DEVICE_BACKEND_AUTOLOAD=0 in
# __init__ to keep that autoload from crashing worker startup). If torch_nnpa
# is present but the nnpa device cannot be initialized, vLLM fails to start
# (no silent CPU fallback) -- set this to "cpu" to run the vision tower on
# CPU. The LLM forward continues to run on Spyre via
# torch.compile(backend="sendnn") regardless.
"SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
),
}