GGUF Inference
Source: vllm-project/vllm.
from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path):
    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
    # Wrap each prompt in the model's chat template.
    prompts = [
        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
        for prompt in prompts
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM from the local GGUF file. The tokenizer comes from the
    # original (unquantized) base model, since converting the tokenizer
    # embedded in a GGUF file is slower and less reliable.
    llm = LLM(model=model_path,
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              gpu_memory_utilization=0.95)

    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    # Download the quantized GGUF checkpoint from the Hugging Face Hub.
    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model)
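
The same entry point works with other quantization levels from the same repository. A minimal sketch, assuming a Q8_0 file exists under the repo's usual naming scheme (the filename below is an assumption, not taken from the example above):

# Hypothetical: reuse run_gguf_inference with a different quantization level.
# The Q8_0 filename is assumed to follow the same pattern as the Q4_0 file above.
model_q8 = hf_hub_download(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q8_0.gguf",  # assumed filename
)
run_gguf_inference(model_q8)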