vllm
vLLM: a high-throughput and memory-efficient inference engine for LLMs
Subpackages
vllm.adapter_commons
vllm.assets
vllm.attention
vllm.attention.backends
vllm.attention.backends.mla
vllm.attention.backends.abstract
vllm.attention.backends.blocksparse_attn
vllm.attention.backends.cpu_mla
vllm.attention.backends.dual_chunk_flash_attn
vllm.attention.backends.flash_attn
vllm.attention.backends.flashinfer
vllm.attention.backends.flashmla
vllm.attention.backends.hpu_attn
vllm.attention.backends.ipex_attn
vllm.attention.backends.pallas
vllm.attention.backends.placeholder_attn
vllm.attention.backends.rocm_aiter_mla
vllm.attention.backends.rocm_flash_attn
vllm.attention.backends.torch_sdpa
vllm.attention.backends.triton_mla
vllm.attention.backends.utils
vllm.attention.backends.xformers
vllm.attention.ops
vllm.attention.ops.blocksparse_attention
vllm.attention.ops.chunked_prefill_paged_decode
vllm.attention.ops.flashmla
vllm.attention.ops.hpu_paged_attn
vllm.attention.ops.ipex_attn
vllm.attention.ops.merge_attn_states
vllm.attention.ops.nki_flash_attn
vllm.attention.ops.paged_attn
vllm.attention.ops.prefix_prefill
vllm.attention.ops.rocm_aiter_mla
vllm.attention.ops.rocm_aiter_paged_attn
vllm.attention.ops.triton_decode_attention
vllm.attention.ops.triton_flash_attention
- Fused Attention
vllm.attention.ops.triton_merge_attn_states
vllm.attention.ops.triton_unified_attention
vllm.attention.layer
vllm.attention.selector
vllm.benchmarks
vllm.compilation
vllm.compilation.activation_quant_fusion
vllm.compilation.backends
vllm.compilation.compiler_interface
vllm.compilation.counter
vllm.compilation.decorators
vllm.compilation.fix_functionalization
vllm.compilation.fusion
vllm.compilation.fx_utils
vllm.compilation.inductor_pass
vllm.compilation.monitor
vllm.compilation.multi_output_match
vllm.compilation.noop_elimination
vllm.compilation.pass_manager
vllm.compilation.sequence_parallelism
vllm.compilation.torch25_custom_graph_pass
vllm.compilation.vllm_inductor_pass
vllm.compilation.wrapper
vllm.core
vllm.device_allocator
vllm.distributed
vllm.distributed.device_communicators
vllm.distributed.device_communicators.all2all
vllm.distributed.device_communicators.base_device_communicator
vllm.distributed.device_communicators.cpu_communicator
vllm.distributed.device_communicators.cuda_communicator
vllm.distributed.device_communicators.cuda_wrapper
vllm.distributed.device_communicators.custom_all_reduce
vllm.distributed.device_communicators.custom_all_reduce_utils
vllm.distributed.device_communicators.hpu_communicator
vllm.distributed.device_communicators.neuron_communicator
vllm.distributed.device_communicators.pynccl
vllm.distributed.device_communicators.pynccl_wrapper
vllm.distributed.device_communicators.shm_broadcast
vllm.distributed.device_communicators.tpu_communicator
vllm.distributed.device_communicators.xpu_communicator
vllm.distributed.kv_transfer
vllm.distributed.communication_op
vllm.distributed.kv_events
vllm.distributed.parallel_state
vllm.distributed.utils
vllm.engine
vllm.entrypoints
vllm.entrypoints.cli
vllm.entrypoints.openai
vllm.entrypoints.openai.tool_parsers
vllm.entrypoints.openai.api_server
vllm.entrypoints.openai.cli_args
vllm.entrypoints.openai.logits_processors
vllm.entrypoints.openai.protocol
vllm.entrypoints.openai.run_batch
vllm.entrypoints.openai.serving_chat
vllm.entrypoints.openai.serving_classification
vllm.entrypoints.openai.serving_completion
vllm.entrypoints.openai.serving_embedding
vllm.entrypoints.openai.serving_engine
vllm.entrypoints.openai.serving_models
vllm.entrypoints.openai.serving_pooling
vllm.entrypoints.openai.serving_score
vllm.entrypoints.openai.serving_tokenization
vllm.entrypoints.openai.serving_transcription
vllm.entrypoints.api_server
vllm.entrypoints.chat_utils
vllm.entrypoints.launcher
vllm.entrypoints.llm
vllm.entrypoints.logger
vllm.entrypoints.score_utils
vllm.entrypoints.ssl
vllm.entrypoints.utils
vllm.executor
vllm.inputs
vllm.logging_utils
vllm.lora
vllm.model_executor
vllm.model_executor.guided_decoding
vllm.model_executor.guided_decoding.guidance_decoding
vllm.model_executor.guided_decoding.guidance_logits_processors
vllm.model_executor.guided_decoding.guided_fields
vllm.model_executor.guided_decoding.lm_format_enforcer_decoding
vllm.model_executor.guided_decoding.outlines_decoding
vllm.model_executor.guided_decoding.outlines_logits_processors
vllm.model_executor.guided_decoding.utils
vllm.model_executor.guided_decoding.xgrammar_decoding
vllm.model_executor.layers
vllm.model_executor.layers.fused_moe
vllm.model_executor.layers.mamba
vllm.model_executor.layers.quantization
vllm.model_executor.layers.activation
vllm.model_executor.layers.layernorm
vllm.model_executor.layers.lightning_attn
vllm.model_executor.layers.linear
vllm.model_executor.layers.logits_processor
vllm.model_executor.layers.pooler
vllm.model_executor.layers.rejection_sampler
vllm.model_executor.layers.resampler
vllm.model_executor.layers.rotary_embedding
vllm.model_executor.layers.sampler
vllm.model_executor.layers.spec_decode_base_sampler
vllm.model_executor.layers.typical_acceptance_sampler
vllm.model_executor.layers.utils
vllm.model_executor.layers.vocab_parallel_embedding
vllm.model_executor.model_loader
vllm.model_executor.model_loader.base_loader
vllm.model_executor.model_loader.bitsandbytes_loader
vllm.model_executor.model_loader.default_loader
vllm.model_executor.model_loader.dummy_loader
vllm.model_executor.model_loader.gguf_loader
vllm.model_executor.model_loader.neuron
vllm.model_executor.model_loader.neuronx_distributed
vllm.model_executor.model_loader.runai_streamer_loader
vllm.model_executor.model_loader.sharded_state_loader
vllm.model_executor.model_loader.tensorizer
vllm.model_executor.model_loader.tensorizer_loader
vllm.model_executor.model_loader.utils
vllm.model_executor.model_loader.weight_utils
vllm.model_executor.models
vllm.model_executor.models.adapters
vllm.model_executor.models.aimv2
vllm.model_executor.models.arctic
vllm.model_executor.models.aria
vllm.model_executor.models.aya_vision
vllm.model_executor.models.baichuan
vllm.model_executor.models.bamba
vllm.model_executor.models.bart
vllm.model_executor.models.bert
vllm.model_executor.models.bert_with_rope
vllm.model_executor.models.blip
vllm.model_executor.models.blip2
vllm.model_executor.models.bloom
vllm.model_executor.models.chameleon
vllm.model_executor.models.chatglm
vllm.model_executor.models.clip
vllm.model_executor.models.commandr
vllm.model_executor.models.constant_size_cache
vllm.model_executor.models.dbrx
vllm.model_executor.models.deepseek
vllm.model_executor.models.deepseek_mtp
vllm.model_executor.models.deepseek_v2
vllm.model_executor.models.deepseek_vl2
vllm.model_executor.models.eagle
vllm.model_executor.models.exaone
vllm.model_executor.models.fairseq2_llama
vllm.model_executor.models.falcon
vllm.model_executor.models.florence2
vllm.model_executor.models.fuyu
vllm.model_executor.models.gemma
vllm.model_executor.models.gemma2
vllm.model_executor.models.gemma3
vllm.model_executor.models.gemma3_mm
vllm.model_executor.models.glm
vllm.model_executor.models.glm4
vllm.model_executor.models.glm4v
vllm.model_executor.models.gpt2
vllm.model_executor.models.gpt_bigcode
vllm.model_executor.models.gpt_j
vllm.model_executor.models.gpt_neox
vllm.model_executor.models.granite
vllm.model_executor.models.granite_speech
vllm.model_executor.models.granitemoe
vllm.model_executor.models.granitemoehybrid
vllm.model_executor.models.granitemoeshared
vllm.model_executor.models.gritlm
vllm.model_executor.models.grok1
vllm.model_executor.models.h2ovl
vllm.model_executor.models.idefics2_vision_model
vllm.model_executor.models.idefics3
vllm.model_executor.models.interfaces
vllm.model_executor.models.interfaces_base
vllm.model_executor.models.intern_vit
vllm.model_executor.models.internlm2
vllm.model_executor.models.internlm2_ve
vllm.model_executor.models.internvl
vllm.model_executor.models.jais
vllm.model_executor.models.jamba
vllm.model_executor.models.kimi_vl
vllm.model_executor.models.llama
vllm.model_executor.models.llama4
vllm.model_executor.models.llama_eagle
vllm.model_executor.models.llama_eagle3
vllm.model_executor.models.llava
vllm.model_executor.models.llava_next
vllm.model_executor.models.llava_next_video
vllm.model_executor.models.llava_onevision
vllm.model_executor.models.mamba
vllm.model_executor.models.mamba2
vllm.model_executor.models.mamba_cache
vllm.model_executor.models.medusa
vllm.model_executor.models.mimo
vllm.model_executor.models.mimo_mtp
vllm.model_executor.models.minicpm
vllm.model_executor.models.minicpm3
vllm.model_executor.models.minicpmo
vllm.model_executor.models.minicpmv
vllm.model_executor.models.minimax_cache
vllm.model_executor.models.minimax_text_01
vllm.model_executor.models.minimax_vl_01
vllm.model_executor.models.mistral3
vllm.model_executor.models.mixtral
vllm.model_executor.models.mixtral_quant
vllm.model_executor.models.mllama
vllm.model_executor.models.mllama4
vllm.model_executor.models.mlp_speculator
vllm.model_executor.models.modernbert
vllm.model_executor.models.module_mapping
vllm.model_executor.models.molmo
vllm.model_executor.models.moonvit
vllm.model_executor.models.mpt
vllm.model_executor.models.nemotron
vllm.model_executor.models.nemotron_nas
vllm.model_executor.models.nvlm_d
vllm.model_executor.models.olmo
vllm.model_executor.models.olmo2
vllm.model_executor.models.olmoe
vllm.model_executor.models.opt
vllm.model_executor.models.orion
vllm.model_executor.models.ovis
vllm.model_executor.models.paligemma
vllm.model_executor.models.persimmon
vllm.model_executor.models.phi
vllm.model_executor.models.phi3
vllm.model_executor.models.phi3_small
vllm.model_executor.models.phi3v
vllm.model_executor.models.phi4mm
vllm.model_executor.models.phi4mm_audio
vllm.model_executor.models.phi4mm_utils
vllm.model_executor.models.phimoe
vllm.model_executor.models.pixtral
vllm.model_executor.models.plamo2
vllm.model_executor.models.prithvi_geospatial_mae
vllm.model_executor.models.qwen
vllm.model_executor.models.qwen2
vllm.model_executor.models.qwen2_5_omni_thinker
vllm.model_executor.models.qwen2_5_vl
vllm.model_executor.models.qwen2_audio
vllm.model_executor.models.qwen2_moe
vllm.model_executor.models.qwen2_rm
vllm.model_executor.models.qwen2_vl
vllm.model_executor.models.qwen3
vllm.model_executor.models.qwen3_moe
vllm.model_executor.models.qwen_vl
vllm.model_executor.models.registry
vllm.model_executor.models.roberta
vllm.model_executor.models.siglip
vllm.model_executor.models.skyworkr1v
vllm.model_executor.models.smolvlm
vllm.model_executor.models.solar
vllm.model_executor.models.stablelm
vllm.model_executor.models.starcoder2
vllm.model_executor.models.telechat2
vllm.model_executor.models.teleflm
vllm.model_executor.models.transformers
vllm.model_executor.models.ultravox
vllm.model_executor.models.utils
vllm.model_executor.models.vision
vllm.model_executor.models.whisper
vllm.model_executor.models.zamba2
vllm.model_executor.custom_op
vllm.model_executor.parameter
vllm.model_executor.pooling_metadata
vllm.model_executor.sampling_metadata
vllm.model_executor.utils
vllm.multimodal
vllm.platforms
vllm.plugins
vllm.profiler
vllm.prompt_adapter
vllm.reasoning
vllm.spec_decode
vllm.spec_decode.batch_expansion
vllm.spec_decode.draft_model_runner
vllm.spec_decode.interfaces
vllm.spec_decode.medusa_worker
vllm.spec_decode.metrics
vllm.spec_decode.mlp_speculator_worker
vllm.spec_decode.mqa_scorer
vllm.spec_decode.multi_step_worker
vllm.spec_decode.ngram_worker
vllm.spec_decode.proposer_worker_base
vllm.spec_decode.smaller_tp_proposer_worker
vllm.spec_decode.spec_decode_worker
vllm.spec_decode.target_model_runner
vllm.spec_decode.top1_proposer
vllm.spec_decode.util
vllm.transformers_utils
vllm.transformers_utils.chat_templates
vllm.transformers_utils.configs
vllm.transformers_utils.configs.arctic
vllm.transformers_utils.configs.chatglm
vllm.transformers_utils.configs.cohere2
vllm.transformers_utils.configs.dbrx
vllm.transformers_utils.configs.deepseek_vl2
vllm.transformers_utils.configs.eagle
vllm.transformers_utils.configs.exaone
vllm.transformers_utils.configs.falcon
vllm.transformers_utils.configs.h2ovl
vllm.transformers_utils.configs.internvl
vllm.transformers_utils.configs.jais
vllm.transformers_utils.configs.kimi_vl
vllm.transformers_utils.configs.medusa
vllm.transformers_utils.configs.minimax_text_01
vllm.transformers_utils.configs.minimax_vl_01
vllm.transformers_utils.configs.mllama
vllm.transformers_utils.configs.mlp_speculator
vllm.transformers_utils.configs.moonvit
vllm.transformers_utils.configs.mpt
vllm.transformers_utils.configs.nemotron
vllm.transformers_utils.configs.nvlm_d
vllm.transformers_utils.configs.ovis
vllm.transformers_utils.configs.skyworkr1v
vllm.transformers_utils.configs.solar
vllm.transformers_utils.configs.telechat2
vllm.transformers_utils.configs.ultravox
vllm.transformers_utils.processors
vllm.transformers_utils.tokenizers
vllm.transformers_utils.config
vllm.transformers_utils.detokenizer
vllm.transformers_utils.detokenizer_utils
vllm.transformers_utils.processor
vllm.transformers_utils.s3_utils
vllm.transformers_utils.tokenizer
vllm.transformers_utils.tokenizer_base
vllm.transformers_utils.tokenizer_group
vllm.transformers_utils.utils
vllm.triton_utils
vllm.usage
vllm.v1
vllm.worker
vllm.worker.cache_engine
vllm.worker.cpu_enc_dec_model_runner
vllm.worker.cpu_model_runner
vllm.worker.cpu_pooling_model_runner
vllm.worker.cpu_worker
vllm.worker.enc_dec_model_runner
vllm.worker.hpu_model_runner
vllm.worker.hpu_worker
vllm.worker.model_runner
vllm.worker.model_runner_base
vllm.worker.multi_step_hpu_worker
vllm.worker.multi_step_model_runner
vllm.worker.multi_step_neuron_model_runner
vllm.worker.multi_step_neuronx_distributed_model_runner
vllm.worker.multi_step_tpu_worker
vllm.worker.multi_step_worker
vllm.worker.neuron_model_runner
vllm.worker.neuron_worker
vllm.worker.neuronx_distributed_model_runner
vllm.worker.pooling_model_runner
vllm.worker.tpu_model_runner
vllm.worker.tpu_worker
vllm.worker.utils
vllm.worker.worker
vllm.worker.worker_base
vllm.worker.xpu_model_runner
vllm.worker.xpu_worker
Submodules
vllm.beam_search
vllm.collect_env
vllm.config
vllm.connections
vllm.env_override
vllm.envs
vllm.forward_context
vllm.jsontree
vllm.logger
vllm.logits_process
vllm.outputs
vllm.pooling_params
vllm.sampling_params
vllm.scalar_type
vllm.scripts
vllm.sequence
vllm.test_utils
vllm.tracing
vllm.utils
vllm.version