vllm.benchmarks.endpoint_request_func

The request function for API endpoints.

AIOHTTP_TIMEOUT `module-attribute` ¶

# Timeout handed to every aiohttp session created here: total budget of
# 6 hours (6 * 3600).
AIOHTTP_TIMEOUT = ClientTimeout(total=6 * 3600)

ASYNC_REQUEST_FUNCS `module-attribute` ¶

# Lookup table from a short name to the async request function behind it.
ASYNC_REQUEST_FUNCS = {"openai-comp": async_request_openai_completions}

RequestFuncInput `dataclass` ¶

The input for the request function.

Source code in vllm/benchmarks/endpoint_request_func.py

@dataclass
class RequestFuncInput:
    """The input for the request function.

    Field order is the positional constructor order. The descriptions below
    state how ``async_request_openai_completions`` consumes each field.

    Attributes:
        prompt: Sent as the payload's ``"prompt"``.
        api_url: URL the request is POSTed to.
        prompt_len: Copied unchanged onto the output's ``prompt_len``.
        output_len: Sent as the payload's ``"max_tokens"``.
        model: Sent as the payload's ``"model"`` when ``model_name`` is
            unset or empty.
        model_name: When truthy, sent as the payload's ``"model"`` instead
            of ``model``.
        best_of: Sent as the payload's ``"best_of"``.
        logprobs: Sent as the payload's ``"logprobs"``.
        extra_body: When truthy, merged into the payload after the fields
            above, so its keys take precedence.
        multi_modal_content: Not read by the completions request function.
        ignore_eos: When true, ``"ignore_eos"`` is added to the payload.
    """

    # --- required -----------------------------------------------------
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str

    # --- optional -----------------------------------------------------
    model_name: Optional[str] = None
    best_of: int = 1
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False

api_url `instance-attribute` ¶

api_url: str

best_of `class-attribute` `instance-attribute` ¶

best_of: int = 1

extra_body `class-attribute` `instance-attribute` ¶

extra_body: Optional[dict] = None

ignore_eos `class-attribute` `instance-attribute` ¶

ignore_eos: bool = False

logprobs `class-attribute` `instance-attribute` ¶

logprobs: Optional[int] = None

model `instance-attribute` ¶

model: str

model_name `class-attribute` `instance-attribute` ¶

model_name: Optional[str] = None

multi_modal_content `class-attribute` `instance-attribute` ¶

multi_modal_content: Optional[dict] = None

output_len `instance-attribute` ¶

output_len: int

prompt `instance-attribute` ¶

prompt: str

prompt_len `instance-attribute` ¶

prompt_len: int

__init__ ¶

__init__(
    prompt: str,
    api_url: str,
    prompt_len: int,
    output_len: int,
    model: str,
    model_name: Optional[str] = None,
    best_of: int = 1,
    logprobs: Optional[int] = None,
    extra_body: Optional[dict] = None,
    multi_modal_content: Optional[dict] = None,
    ignore_eos: bool = False,
) -> None

RequestFuncOutput `dataclass` ¶

The output of the request function including metrics.

Source code in vllm/benchmarks/endpoint_request_func.py

@dataclass
class RequestFuncOutput:
    """The output of the request function including metrics.

    Every field has a default, so an instance can be created empty and
    filled in while the response is processed.

    Attributes:
        generated_text: Text received from the endpoint, concatenated.
        success: Whether the request is counted as successful.
        latency: Time from sending the request to the last received token
            chunk.
        output_tokens: Completion token count reported by the endpoint.
        ttft: Time to first token.
        itl: List of inter-token latencies.
        tpot: Average next-token latency; not populated by
            ``async_request_openai_completions``.
        prompt_len: Prompt length copied from the request input.
        error: Failure description; empty when nothing went wrong.
    """

    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0
    # A factory gives each instance its own list instead of a shared one.
    itl: list[float] = field(default_factory=list)
    tpot: float = 0.0
    prompt_len: int = 0
    error: str = ""

error `class-attribute` `instance-attribute` ¶

error: str = ''

generated_text `class-attribute` `instance-attribute` ¶

generated_text: str = ''

itl `class-attribute` `instance-attribute` ¶

itl: list[float] = field(default_factory=list)

latency `class-attribute` `instance-attribute` ¶

latency: float = 0.0

output_tokens `class-attribute` `instance-attribute` ¶

output_tokens: int = 0

prompt_len `class-attribute` `instance-attribute` ¶

prompt_len: int = 0

success `class-attribute` `instance-attribute` ¶

success: bool = False

tpot `class-attribute` `instance-attribute` ¶

tpot: float = 0.0

ttft `class-attribute` `instance-attribute` ¶

ttft: float = 0.0

__init__ ¶

__init__(
    generated_text: str = "",
    success: bool = False,
    latency: float = 0.0,
    output_tokens: int = 0,
    ttft: float = 0.0,
    itl: list[float] = list(),
    tpot: float = 0.0,
    prompt_len: int = 0,
    error: str = "",
) -> None

async_request_openai_completions `async` ¶

async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput

The async request function for the OpenAI Completions API.

Parameters:

Name	Type	Description	Default
`request_func_input`	`RequestFuncInput`	The input for the request function.	required
`pbar`	`Optional[tqdm]`	The progress bar to display the progress.	`None`

Returns:

Type	Description
`RequestFuncOutput`	The output of the request function.

Source code in vllm/benchmarks/endpoint_request_func.py

async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """The async request function for the OpenAI Completions API.

    Sends one streaming completion request and records the generated text
    together with timing metrics. A non-200 status, an exception raised
    while sending or parsing, or a stream that never delivers a ``choices``
    chunk is reported through ``success``/``error`` on the returned object
    instead of being raised.

    Args:
        request_func_input: The input for the request function.
        pbar: The progress bar to display the progress. It is advanced by
            one once the request has finished, successful or not.

    Returns:
        The output of the request function.

    Raises:
        AssertionError: If ``api_url`` does not end with ``"completions"``
            or ``"profile"`` (when assertions are enabled).
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            # An explicit model_name takes precedence over model.
            "model": (request_func_input.model_name
                      or request_func_input.model),
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            # Applied last so caller-supplied keys override the defaults.
            payload.update(request_func_input.extra_body)
        # NOTE: when OPENAI_API_KEY is unset this sends "Bearer None".
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        text_parts: list[str] = []
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        line = chunk_bytes.decode("utf-8")
                        # Server-sent-event comment lines (often used as
                        # keep-alive pings) start with a colon and carry no
                        # JSON payload, so they must not reach json.loads.
                        if line.startswith(":"):
                            continue

                        chunk = line.removeprefix("data: ")
                        if chunk == "[DONE]":
                            continue
                        data = json.loads(chunk)

                        # NOTE: Some completion API might have a last
                        # usage summary response without a token so we
                        # want to check a token was generated
                        if choices := data.get("choices"):
                            # Note that text could be empty here
                            # e.g. for special tokens
                            text = choices[0].get("text")
                            timestamp = time.perf_counter()
                            if not first_chunk_received:
                                # First token. Reuse `timestamp` so that
                                # ttft + sum(itl) equals the latency.
                                first_chunk_received = True
                                output.ttft = timestamp - st
                            else:
                                # Decoding phase
                                output.itl.append(timestamp -
                                                  most_recent_timestamp)

                            most_recent_timestamp = timestamp
                            text_parts.append(text or "")
                        elif usage := data.get("usage"):
                            output.output_tokens = usage.get(
                                "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT. "
                            "This response will be marked as failed!")
                    output.generated_text = "".join(text_parts)
                    # Measured up to the last token chunk, not stream close.
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            # Record the failure with its full traceback instead of raising.
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    # Compare against None explicitly so that a progress bar object which
    # evaluates as falsy is still advanced.
    if pbar is not None:
        pbar.update(1)
    return output

vllm.benchmarks.endpoint_request_func

AIOHTTP_TIMEOUT module-attribute ¶

ASYNC_REQUEST_FUNCS module-attribute ¶

RequestFuncInput dataclass ¶

api_url instance-attribute ¶

best_of class-attribute instance-attribute ¶

extra_body class-attribute instance-attribute ¶

ignore_eos class-attribute instance-attribute ¶

logprobs class-attribute instance-attribute ¶

model instance-attribute ¶

model_name class-attribute instance-attribute ¶

multi_modal_content class-attribute instance-attribute ¶

output_len instance-attribute ¶

prompt instance-attribute ¶

prompt_len instance-attribute ¶

__init__ ¶

RequestFuncOutput dataclass ¶

error class-attribute instance-attribute ¶

generated_text class-attribute instance-attribute ¶

itl class-attribute instance-attribute ¶

latency class-attribute instance-attribute ¶

output_tokens class-attribute instance-attribute ¶

prompt_len class-attribute instance-attribute ¶

success class-attribute instance-attribute ¶

tpot class-attribute instance-attribute ¶

ttft class-attribute instance-attribute ¶

__init__ ¶

async_request_openai_completions async ¶

AIOHTTP_TIMEOUT `module-attribute` ¶

ASYNC_REQUEST_FUNCS `module-attribute` ¶

RequestFuncInput `dataclass` ¶

api_url `instance-attribute` ¶

best_of `class-attribute` `instance-attribute` ¶

extra_body `class-attribute` `instance-attribute` ¶

ignore_eos `class-attribute` `instance-attribute` ¶

logprobs `class-attribute` `instance-attribute` ¶

model `instance-attribute` ¶

model_name `class-attribute` `instance-attribute` ¶

multi_modal_content `class-attribute` `instance-attribute` ¶

output_len `instance-attribute` ¶

prompt `instance-attribute` ¶

prompt_len `instance-attribute` ¶

__init__ ¶

RequestFuncOutput `dataclass` ¶

error `class-attribute` `instance-attribute` ¶

generated_text `class-attribute` `instance-attribute` ¶

itl `class-attribute` `instance-attribute` ¶

latency `class-attribute` `instance-attribute` ¶

output_tokens `class-attribute` `instance-attribute` ¶

prompt_len `class-attribute` `instance-attribute` ¶

success `class-attribute` `instance-attribute` ¶

tpot `class-attribute` `instance-attribute` ¶

ttft `class-attribute` `instance-attribute` ¶

__init__ ¶

async_request_openai_completions `async` ¶