OpenAI Chat Completion Client For Multimodal
Source: vllm-project/vllm.
1"""An example showing how to use vLLM to serve multimodal models
2and run online inference with OpenAI client.
3
4Launch the vLLM server with the following command:
5
6(single image inference with Llava)
7vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
8
9(multi-image inference with Phi-3.5-vision-instruct)
10vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
11 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
12
13(audio inference with Ultravox)
14vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
15"""
16import base64
17
18import requests
19from openai import OpenAI
20
21from vllm.assets.audio import AudioAsset
22from vllm.utils import FlexibleArgumentParser
23
# Point the OpenAI client at vLLM's API server by overriding the API key
# and the API base URL.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Without an explicit api_key the client would fall back to
# os.environ.get("OPENAI_API_KEY").
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Query the server for its models and pick the first one listed.
models = client.models.list()
model = models.data[0].id
36
37
def encode_base64_content_from_url(content_url: str,
                                   timeout: float = 30.0) -> str:
    """Download ``content_url`` and return its body encoded as base64 text.

    Args:
        content_url: URL of the remote content to fetch.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout unless one is passed, so
            without it an unresponsive host would block the example
            indefinitely.

    Returns:
        The response body, base64-encoded and decoded as a UTF-8 string.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    with requests.get(content_url, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of encoding an error page.
        response.raise_for_status()
        return base64.b64encode(response.content).decode('utf-8')
46
47
48# Text-only inference
def run_text_only() -> None:
    """Send a single plain-text user message and print the reply."""
    question = {
        "role": "user",
        "content": "What's the capital of France?",
    }
    response = client.chat.completions.create(
        messages=[question],
        model=model,
        max_completion_tokens=64,
    )

    print("Chat completion output:", response.choices[0].message.content)
61
62
63# Single-image input inference
def run_single_image() -> None:
    """Ask about one image twice: via its URL, then via a base64 data URL."""
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    def ask_about(url: str):
        """Return the reply text for a text + image request using ``url``."""
        response = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's in this image?",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": url,
                        },
                    },
                ],
            }],
            model=model,
            max_completion_tokens=64,
        )
        return response.choices[0].message.content

    # Pass the remote image URL directly in the payload.
    reply_from_url = ask_about(image_url)
    print("Chat completion output from image url:", reply_from_url)

    # Fetch the image on the client side and embed it as a base64 data URL.
    image_base64 = encode_base64_content_from_url(image_url)
    reply_from_base64 = ask_about(f"data:image/jpeg;base64,{image_base64}")
    print("Chat completion output from base64 encoded image:",
          reply_from_base64)
117
118
119# Multi-image input inference
def run_multi_image() -> None:
    """Ask one question about two images passed by URL in a single message."""
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

    text_part = {
        "type": "text",
        "text": "What are the animals in these images?",
    }
    # One ``image_url`` content part per image, duck first, then lion.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": url,
        },
    } for url in (image_url_duck, image_url_lion)]

    response = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [text_part, *image_parts],
        }],
        model=model,
        max_completion_tokens=64,
    )

    print("Chat completion output:", response.choices[0].message.content)
152
153
154# Audio input inference
def run_audio() -> None:
    """Ask about one audio clip three ways and print each reply.

    The clip is sent as an ``input_audio`` part, as an ``audio_url`` part
    holding the HTTP URL, and as an ``audio_url`` part holding a base64
    data URL.
    """
    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    def ask_about(audio_part: dict):
        """Return the reply text for a text + ``audio_part`` request."""
        response = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What's in this audio?",
                    },
                    audio_part,
                ],
            }],
            model=model,
            max_completion_tokens=64,
        )
        return response.choices[0].message.content

    # OpenAI-compatible schema (`input_audio`)
    reply = ask_about({
        "type": "input_audio",
        "input_audio": {
            # Any format supported by librosa is supported
            "data": audio_base64,
            "format": "wav",
        },
    })
    print("Chat completion output from input audio:", reply)

    # HTTP URL
    reply = ask_about({
        "type": "audio_url",
        "audio_url": {
            # Any format supported by librosa is supported
            "url": audio_url,
        },
    })
    print("Chat completion output from audio url:", reply)

    # base64 URL
    reply = ask_about({
        "type": "audio_url",
        "audio_url": {
            # Any format supported by librosa is supported
            "url": f"data:audio/ogg;base64,{audio_base64}",
        },
    })
    print("Chat completion output from base64 encoded audio:", reply)
237
238
# Dispatch table: chat type name -> demo function that runs it.
example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "audio": run_audio,
}


def main(args) -> None:
    """Run the demo registered under ``args.chat_type``."""
    run_example = example_function_map[args.chat_type]
    run_example()
250
251
if __name__ == "__main__":
    # Parse the requested chat type from the command line and run its demo.
    parser = FlexibleArgumentParser(
        description='Demo on using OpenAI client for online inference with '
        'multimodal language models served with vLLM.')
    parser.add_argument(
        '--chat-type',
        '-c',
        type=str,
        default="single-image",
        # Derive the accepted values from the dispatch table so the CLI and
        # the registered demos cannot drift apart.
        choices=list(example_function_map),
        help='Conversation type with multimodal data.')
    args = parser.parse_args()
    main(args)
263 args = parser.parse_args()
264 main(args)