OpenAI Chat Completion Client For Multimodal
Source: vllm-project/vllm.
1"""An example showing how to use vLLM to serve multimodal models
2and run online inference with OpenAI client.
3
4Launch the vLLM server with the following command:
5
6(single image inference with Llava)
7vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
8
9(multi-image inference with Phi-3.5-vision-instruct)
10vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
11 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
12
13(audio inference with Ultravox)
14vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
15"""
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id


def encode_base64_content_from_url(content_url: str) -> str:
    """Encode content retrieved from a remote URL into base64 format."""

    with requests.get(content_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


# Text-only inference
def run_text_only() -> None:
    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": "What's the capital of France?"
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion.choices[0].message.content
    print("Chat completion output:", result)


# Single-image input inference
def run_single_image() -> None:

    ## Use image url in the payload
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:", result)

    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)


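# Illustrative addition (not part of the upstream example): the OpenAI client
# calls above are equivalent to a plain HTTP POST against the server's
# OpenAI-compatible /chat/completions endpoint. A minimal sketch using the
# already-imported `requests`, mirroring the image-URL request from
# run_single_image(); the function name and payload shown here are for
# illustration only and are not wired into example_function_map below.
def run_single_image_raw_http() -> None:
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    payload = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                },
            ],
        }],
        "max_tokens": 64,
    }
    response = requests.post(
        f"{openai_api_base}/chat/completions",
        headers={"Authorization": f"Bearer {openai_api_key}"},
        json=payload,
    )
    response.raise_for_status()
    result = response.json()["choices"][0]["message"]["content"]
    print("Chat completion output from raw HTTP request:", result)

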
# Multi-image input inference
def run_multi_image() -> None:
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_duck
                    },
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_lion
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output:", result)


# Audio input inference
def run_audio() -> None:
    # Any format supported by librosa is supported
    audio_url = AudioAsset("winning_call").url

    # Use audio url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": audio_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)

    audio_base64 = encode_base64_content_from_url(audio_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        # Any format supported by librosa is supported
                        "url": f"data:audio/ogg;base64,{audio_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded audio:", result)


example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "audio": run_audio,
}


def main(args) -> None:
    chat_type = args.chat_type
    example_function_map[chat_type]()


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo of using the OpenAI client for online inference '
        'with multimodal language models served by vLLM.')
    parser.add_argument(
        '--chat-type',
        '-c',
        type=str,
        default="single-image",
        choices=["text-only", "single-image", "multi-image", "audio"],
        help='Conversation type with multimodal data.')
    args = parser.parse_args()
    main(args)