Offline Inference Chat
Source: vllm-project/vllm.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.5)
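# Not part of the original example, just a sketch: SamplingParams also
# exposes other commonly used fields such as top_p and max_tokens, e.g.
# sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=512)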
def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)
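
# Batch-inference sketch (an addition, not part of the original example):
# recent vLLM releases also let llm.chat accept a list of conversations, so
# several chats are generated in a single batched call. Uncomment if your
# installed version supports it.
# conversations = [conversation for _ in range(10)]
# outputs = llm.chat(conversations,
#                    sampling_params=sampling_params,
#                    use_tqdm=True)
# print_outputs(outputs)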

# A chat template can be optionally supplied.
# If not, the model will use its default chat template.

# with open('template_falcon_180b.jinja', "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversation,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )
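
# Sketch only: since chat_template is just a Jinja string, a template can
# also be supplied inline rather than read from a file. The variable names
# `messages` and `add_generation_prompt` follow the Hugging Face
# chat-template convention; the template text below is illustrative, not a
# template shipped with vLLM.
# inline_template = (
#     "{% for message in messages %}"
#     "{{ message['role'] }}: {{ message['content'] }}\n"
#     "{% endfor %}"
#     "{% if add_generation_prompt %}assistant: {% endif %}"
# )
# outputs = llm.chat(
#     conversation,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=inline_template,
# )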