Environment Variables

SenDNN Inference uses the following environment variables to configure the system:
# Registry of the environment variables understood by SenDNN Inference.
#
# Each key is a variable name; each value is a zero-argument callable that
# reads the variable from the process environment *when it is called* (not
# when this dict is built), falls back to the default shown in the entry if
# the variable is unset, and converts the raw string to a Python value.
#
# Boolean flags are parsed as bool(int(value)), so they must be given as
# integers ("0" / "1"); a non-numeric string such as "true" raises ValueError
# when the entry is evaluated.
environment_variables: dict[str, Callable[[], Any]] = {
    # Defines the prompt lengths the Spyre accelerator should be prepared
    # for, formatted as a comma separated list. Only applicable in pooling.
    # Evaluates to a list of ints; defaults to [512].
    "SENDNN_INFERENCE_WARMUP_PROMPT_LENS": lambda: [
        int(p) for p in os.getenv("SENDNN_INFERENCE_WARMUP_PROMPT_LENS", "512").split(",")
    ],
    # Defines the batch sizes the Spyre accelerator should be prepared
    # for, formatted as a comma separated list. Only applicable in pooling.
    # Evaluates to a list of ints; defaults to [8].
    "SENDNN_INFERENCE_WARMUP_BATCH_SIZES": lambda: [
        int(b) for b in os.getenv("SENDNN_INFERENCE_WARMUP_BATCH_SIZES", "8").split(",")
    ],
    # Defines the backend that torch.compile will use when using Spyre
    # Available options:
    # - "sendnn": Compile for execution on Spyre hardware (default)
    # - "inductor": Compile for execution on CPU (for debug and testing)
    # - "eager": Skip compile entirely (for debug and testing)
    #
    "SENDNN_INFERENCE_DYNAMO_BACKEND": lambda: os.getenv(
        "SENDNN_INFERENCE_DYNAMO_BACKEND", "sendnn"
    ),
    # Enable performance metric logging. This captures startup information
    # such as warmup times, and loading times.
    # When `--disable-log-stats=False` is used, this will log timing metrics
    # about every finished request into a .jsonl file. These are the same
    # metrics that are available in prometheus format on the /metrics endpoint,
    # but it is sometimes helpful to view them disaggregated to debug
    # performance problems. This logging is not designed to be performant, and
    # should not be enabled in production settings.
    # It is turned off by default. Evaluates to an int (not a bool).
    "SENDNN_INFERENCE_PERF_METRIC_LOGGING_ENABLED": lambda: int(
        # String default, like every other entry: os.getenv hands the default
        # back unchanged, and the environment itself only ever holds strings.
        os.getenv("SENDNN_INFERENCE_PERF_METRIC_LOGGING_ENABLED", "0")
    ),
    # Directory to write performance metric logging files. By default,
    # logs are written to /tmp.
    "SENDNN_INFERENCE_PERF_METRIC_LOGGING_DIR": lambda: os.getenv(
        "SENDNN_INFERENCE_PERF_METRIC_LOGGING_DIR", "/tmp"
    ),
    # If set, override the signal handler for sendnn-inference on
    # vLLM V1 + torch_sendnn backend to be able to gracefully
    # shutdown the engine. Enabled ("1") by default.
    "SENDNN_INFERENCE_OVERRIDE_SIGNALS_HANDLER": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_OVERRIDE_SIGNALS_HANDLER", "1"))
    ),
    # Allow sendnn-inference to update env vars related to multi-threading (eg. OMP)
    # based on the detected CPU cores and server configuration
    # Multimodal models will not take into account the number of workers for configuration.
    # Enabled ("1") by default.
    "SENDNN_INFERENCE_UPDATE_THREAD_CONFIG": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_UPDATE_THREAD_CONFIG", "1"))
    ),
    # If set, limit the number of concurrent processes loading/compiling
    # large models or models with larger context lengths to limit
    # memory usage.
    # Set to 0 (the default) to allow any number of processes
    "SENDNN_INFERENCE_MAX_LOAD_PROCESSES": lambda: int(
        os.getenv("SENDNN_INFERENCE_MAX_LOAD_PROCESSES", "0")
    ),
    # If set, redirects all stdout and stderr from worker processes to files
    # within this directory. This is useful for debugging card-specific errors
    # in multi-AIU setups, but should never be enabled in production settings.
    # This removes all output from stdout and stderr for the worker processes.
    # Evaluates to the empty string when unset.
    "SENDNN_INFERENCE_WORKER_LOG_REDIRECT_DIR": lambda: os.getenv(
        "SENDNN_INFERENCE_WORKER_LOG_REDIRECT_DIR", ""
    ),
    # If set, overrides the default (30 minutes) timeout for
    # torch.distributed.init_process_group. The value is in minutes.
    # Note that when the variable is unset this entry evaluates to 60.
    "SENDNN_INFERENCE_GLOO_TIMEOUT_MINUTES": lambda: int(
        os.getenv("SENDNN_INFERENCE_GLOO_TIMEOUT_MINUTES", "60")
    ),
    # If set, this will require use of pre-compiled models and
    # disable compilation for decoders. Disabled ("0") by default.
    "SENDNN_INFERENCE_REQUIRE_PRECOMPILED_DECODERS": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_REQUIRE_PRECOMPILED_DECODERS", "0"))
    ),
    # Simple compile backend for some dynamically compiled operations, like
    # gathering logprobs in the sampler.
    # Defaults to "inductor", which can be used if python headers and a
    # compiler are available.
    # NOTE(review): this comment previously claimed the default was "eager"
    # while the code returned "inductor"; confirm "inductor" is intended.
    "SENDNN_INFERENCE_SIMPLE_COMPILE_BACKEND": lambda: os.getenv(
        "SENDNN_INFERENCE_SIMPLE_COMPILE_BACKEND", "inductor"
    ),
    # Configures the number of CPUs used when determining multi-threading
    # configurations
    # Set to 0 (the default) to have sendnn-inference attempt to detect the
    # CPU count
    "SENDNN_INFERENCE_NUM_CPUS": lambda: int(os.getenv("SENDNN_INFERENCE_NUM_CPUS", "0")),
    # Feature Flag
    # Works only with chunked prefill enabled. If set, prefill steps are
    # interleaved with a decode step. Enabled ("1") by default.
    "SENDNN_INFERENCE_CP_INTERLEAVE_STEPS": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_CP_INTERLEAVE_STEPS", "1"))
    ),
    # If set, raises a runtime error if the model configuration is not found
    # in the known configurations registry. Only applies when running on
    # Spyre device (sendnn backend). Disabled ("0") by default.
    "SENDNN_INFERENCE_REQUIRE_KNOWN_CONFIG": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_REQUIRE_KNOWN_CONFIG", "0"))
    ),
    # Path to custom model_configs.yaml file. If not set, uses the default
    # location at sendnn_inference/config/model_configs.yaml
    # Evaluates to None when unset (no default is passed to os.getenv).
    "SENDNN_INFERENCE_MODEL_CONFIG_FILE": lambda: os.getenv("SENDNN_INFERENCE_MODEL_CONFIG_FILE"),
    # Dtype for multimodal vision_tower / multi_modal_projector params (CPU).
    # One of "float32" | "float16" | "bfloat16"; default per platform.
    # The default is looked up by platform.machine(), falling back to
    # "float16" for machines with no entry, and the resulting string is
    # handed to parse_cpu_mm_dtype.
    "SENDNN_INFERENCE_CPU_MM_DTYPE": lambda: parse_cpu_mm_dtype(
        os.getenv(
            "SENDNN_INFERENCE_CPU_MM_DTYPE",
            _CPU_MM_DTYPE_PLATFORM_DEFAULTS.get(platform.machine(), "float16"),
        )
    ),
    # Device used to execute the multimodal vision_tower / multi_modal_projector.
    # "auto" (default) routes the encoder to the Telum through the torch_nnpa
    # privateuse1 backend when torch_nnpa is importable, and falls back silently
    # to CPU only when torch_nnpa is *absent*. "cpu" forces CPU execution. "nnpa"
    # additionally raises ImportError at startup if torch_nnpa is missing. This
    # setting only resolves intent; torch_nnpa is not imported here. The
    # privateuse1 backend is registered lazily -- and only for multimodal models
    # -- at vision-weight placement via utils.ensure_nnpa_registered(), which
    # also registers the PrivateUse1HooksInterface that PyTorch autoload would
    # normally provide (this plugin sets TORCH_DEVICE_BACKEND_AUTOLOAD=0 in
    # __init__ to keep that autoload from crashing worker startup). If torch_nnpa
    # is present but the nnpa device cannot be initialized, vLLM fails to start
    # (no silent CPU fallback) -- set this to "cpu" to run the vision tower on
    # CPU. The LLM forward continues to run on Spyre via
    # torch.compile(backend="sendnn") regardless.
    "SENDNN_INFERENCE_MM_DEVICE": lambda: parse_mm_device(
        os.getenv("SENDNN_INFERENCE_MM_DEVICE", "auto")
    ),
    # Enable the async vision encoder subprocess.
    # When set to 1, SpyreMultiprocExecutor spawns a separate process that loads
    # only the vision model via get_model(..., vision_only=True) and pre-encodes
    # MM requests in parallel with AIU prefill/decode.  The scheduler gates MM
    # request prefill on encoding readiness so a request only starts prefill once
    # its embedding is available.  Only effective for decoder models with TP > 1.
    # Defaults to 1 (enabled); set to 0 to use the Phase 1 blocking encode path.
    # NOTE(review): this comment previously documented a default of 0
    # (disabled) while the code returned True when unset; confirm that "1" is
    # the intended default.
    "SENDNN_INFERENCE_ASYNC_MM_ENCODER": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_ASYNC_MM_ENCODER", "1"))
    ),
    # When "1" (default), rank 0 runs the vision encoder and shares the result
    # with other TP ranks via POSIX shared memory (one encoder call instead of
    # world_size calls).  Set to "0" to fall back to every TP rank running the
    # vision encoder independently — the original behaviour, which avoids any
    # SHM-related failure modes at the cost of redundant CPU work.
    "SENDNN_INFERENCE_TP_MM_SHARING": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_TP_MM_SHARING", "1"))
    ),
    # When "0" (default) and when there are paused requests, the request with
    # the shortest current output is prioritized when both requests have been
    # paused for the same amount of time. Setting this to 0 will prevent a few
    # requests from having a very high E2E latency, but at the cost of other
    # metrics like throughput, mean TTFT and mean ITL.
    # NOTE(review): the last sentence describes the effect of "0", which is
    # already the default; the trade-off may have been meant for "1" --
    # confirm against the scheduler before relying on this description.
    "SENDNN_INFERENCE_LONG_OUT_PRIO": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_LONG_OUT_PRIO", "0"))
    ),
    # When "1" (default), all requests can be scheduled as long as there are
    # enough KV-cache blocks for the prompt tokens and max output tokens.
    # If the TKV constraints are about to be exceeded, requests are removed
    # from the decode batch. At each iteration the set of running requests
    # is rotated for fairness.
    "SENDNN_INFERENCE_PAUSING_ENABLED": lambda: bool(
        int(os.getenv("SENDNN_INFERENCE_PAUSING_ENABLED", "1"))
    ),
}