Offline Inference With Profiler

Offline Inference With Profiler

Source: examples/offline_inference_with_profiler.py.

 1import os
 2import time
 3
 4from vllm import LLM, SamplingParams
 5
 6# enable torch profiler, can also be set on cmd line
 7os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"
 8
 9# Sample prompts.
10prompts = [
11    "Hello, my name is",
12    "The president of the United States is",
13    "The capital of France is",
14    "The future of AI is",
15]
16# Create a sampling params object.
17sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
18
19if __name__ == "__main__":
20
21    # Create an LLM.
22    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
23
24    llm.start_profile()
25
26    # Generate texts from the prompts. The output is a list of RequestOutput
27    # objects that contain the prompt, generated text, and other information.
28    outputs = llm.generate(prompts, sampling_params)
29
30    llm.stop_profile()
31
32    # Print the outputs.
33    for output in outputs:
34        prompt = output.prompt
35        generated_text = output.outputs[0].text
36        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
37
38    # Add a buffer to wait for profiler in the background process
39    # (in case MP is on) to finish writing profiling output.
40    time.sleep(10)