OpenAI Vision API Client#
Source vllm-project/vllm.
1"""An example showing how to use vLLM to serve VLMs.
2
3Launch the vLLM server with the following command:
4
5(single image inference with Llava)
6vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
7
8(multi-image inference with Phi-3.5-vision-instruct)
9vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
10 --trust-remote-code --limit-mm-per-prompt image=2
11"""
12import base64
13
14import requests
15from openai import OpenAI
16
17# Modify OpenAI's API key and API base to use vLLM's API server.
18openai_api_key = "EMPTY"
19openai_api_base = "http://localhost:8000/v1"
20
21client = OpenAI(
22 # defaults to os.environ.get("OPENAI_API_KEY")
23 api_key=openai_api_key,
24 base_url=openai_api_base,
25)
26
27models = client.models.list()
28model = models.data[0].id
29
30image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
31
32# Use image url in the payload
33chat_completion_from_url = client.chat.completions.create(
34 messages=[{
35 "role":
36 "user",
37 "content": [
38 {
39 "type": "text",
40 "text": "What’s in this image?"
41 },
42 {
43 "type": "image_url",
44 "image_url": {
45 "url": image_url
46 },
47 },
48 ],
49 }],
50 model=model,
51 max_tokens=64,
52)
53
54result = chat_completion_from_url.choices[0].message.content
55print(f"Chat completion output:{result}")
56
57
58# Use base64 encoded image in the payload
59def encode_image_base64_from_url(image_url: str) -> str:
60 """Encode an image retrieved from a remote url to base64 format."""
61
62 with requests.get(image_url) as response:
63 response.raise_for_status()
64 result = base64.b64encode(response.content).decode('utf-8')
65
66 return result
67
68
69image_base64 = encode_image_base64_from_url(image_url=image_url)
70chat_completion_from_base64 = client.chat.completions.create(
71 messages=[{
72 "role":
73 "user",
74 "content": [
75 {
76 "type": "text",
77 "text": "What’s in this image?"
78 },
79 {
80 "type": "image_url",
81 "image_url": {
82 "url": f"data:image/jpeg;base64,{image_base64}"
83 },
84 },
85 ],
86 }],
87 model=model,
88 max_tokens=64,
89)
90
91result = chat_completion_from_base64.choices[0].message.content
92print(f"Chat completion output:{result}")
93
94# Multi-image input inference
95image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
96image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
97chat_completion_from_url = client.chat.completions.create(
98 messages=[{
99 "role":
100 "user",
101 "content": [
102 {
103 "type": "text",
104 "text": "What are the animals in these images?"
105 },
106 {
107 "type": "image_url",
108 "image_url": {
109 "url": image_url_duck
110 },
111 },
112 {
113 "type": "image_url",
114 "image_url": {
115 "url": image_url_lion
116 },
117 },
118 ],
119 }],
120 model=model,
121 max_tokens=64,
122)
123
124result = chat_completion_from_url.choices[0].message.content
125print(f"Chat completion output:{result}")