Gradio Webserver

Source: vllm-project/vllm.

import argparse
import json

import gradio as gr
import requests
 7
 8def http_bot(prompt):
 9    headers = {"User-Agent": "vLLM Client"}
10    pload = {
11        "prompt": prompt,
12        "stream": True,
13        "max_tokens": 128,
14    }
15    response = requests.post(args.model_url,
16                             headers=headers,
17                             json=pload,
18                             stream=True)
19
20    for chunk in response.iter_lines(chunk_size=8192,
21                                     decode_unicode=False,
22                                     delimiter=b"\0"):
23        if chunk:
24            data = json.loads(chunk.decode("utf-8"))
25            output = data["text"][0]
26            yield output
27
28
def build_demo():
    """Assemble and return the Gradio Blocks UI for the completion demo.

    The layout is a title, a prompt input box, and an output box; submitting
    the input streams the model's completion into the output via `http_bot`.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# vLLM text completion demo\n")
        prompt_box = gr.Textbox(label="Input",
                                placeholder="Enter text and press ENTER")
        result_box = gr.Textbox(label="Output",
                                placeholder="Generated result from the model")
        # Wire ENTER on the prompt box to the streaming generator.
        prompt_box.submit(http_bot, [prompt_box], [result_box])
    return demo
38
39
if __name__ == "__main__":
    # CLI: where to serve the Gradio UI and which vLLM endpoint to query.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    parser.add_argument("--model-url", type=str,
                        default="http://localhost:8000/generate")
    # NOTE: `args` must stay module-level — http_bot reads args.model_url.
    args = parser.parse_args()

    # queue() enables streaming/generator outputs; share=True opens a
    # public Gradio tunnel in addition to the local server.
    build_demo().queue().launch(server_name=args.host,
                                server_port=args.port,
                                share=True)