GGUF Inference

Source: vllm-project/vllm. This example downloads a GGUF-quantized TinyLlama chat checkpoint from the Hugging Face Hub and runs offline inference on it with vLLM.

from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path):
    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
    # Wrap each prompt in the model's chat template.
    prompts = [
        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
        for prompt in prompts
    ]
    # Create a sampling params object (greedy decoding, up to 128 new tokens).
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM. The tokenizer is loaded from the original Hugging Face
    # model rather than converted from the GGUF file.
    llm = LLM(model=model_path,
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              gpu_memory_utilization=0.95)

    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    # Download a Q4_0-quantized GGUF checkpoint of TinyLlama from the Hub.
    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model)