Offline Inference Chat

Source: vllm-project/vllm.

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
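# Sampling settings shared by all requests below; other SamplingParams fields
# such as top_p or max_tokens can also be set here.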
sampling_params = SamplingParams(temperature=0.5)

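# Helper: print each prompt alongside its first generated completion.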
def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:
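# The conversation is given in the OpenAI chat format: a list of messages,
# each a dict with "role" and "content" keys.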

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

# A chat template can be optionally supplied.
# If not, the model will use its default chat template.

# with open("template_falcon_180b.jinja", "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversation,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )
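
# Depending on the vLLM version installed, llm.chat may also accept a list of
# conversations and run them as one batch. A minimal sketch under that
# assumption; if your version does not support it, call llm.chat once per
# conversation instead.

# conversations = [conversation, conversation]
# outputs = llm.chat(
#     conversations,
#     sampling_params=sampling_params,
#     use_tqdm=False,
# )
# print_outputs(outputs)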