Offline Inference Vision Language#
Source vllm-project/vllm.
1"""
2This example shows how to use vLLM for running offline inference
3with the correct prompt format on vision language models.
4
5For most models, the prompt format should follow corresponding examples
6on HuggingFace model repository.
7"""
8from transformers import AutoTokenizer
9
10from vllm import LLM, SamplingParams
11from vllm.assets.image import ImageAsset
12from vllm.assets.video import VideoAsset
13from vllm.utils import FlexibleArgumentParser
14
15# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
16# lower-end GPUs.
17# Unless specified, these settings have been tested to work on a single L4.
18
19
20# LLaVA-1.5
def run_llava(question: str, modality: str):
    """Prepare LLaVA-1.5 for answering a question about one image.

    Returns an ``(llm, prompt, stop_token_ids)`` tuple. This model uses no
    custom stop token ids, so the last element is ``None``.
    """
    assert modality == "image"

    text = f"USER: <image>\n{question}\nASSISTANT:"

    engine = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
    return engine, text, None
29
30
31# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question: str, modality: str):
    """Prepare LLaVA-1.6 (LLaVA-NeXT) for a single-image question.

    Returns an ``(llm, prompt, stop_token_ids)`` tuple; the stop token ids
    entry is ``None``.
    """
    assert modality == "image"

    text = f"[INST] <image>\n{question} [/INST]"
    engine_kwargs = {
        "model": "llava-hf/llava-v1.6-mistral-7b-hf",
        "max_model_len": 8192,
    }
    return LLM(**engine_kwargs), text, None
39
40
# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(question: str, modality: str):
    """Prepare LLaVA-NeXT-Video for a question about a video clip.

    Only the ``"video"`` modality is accepted. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple whose last element is ``None``.
    """
    assert modality == "video"

    text = f"USER: <video>\n{question} ASSISTANT:"
    engine = LLM(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
    )
    return engine, text, None
50
51
52# LLaVA-OneVision
def run_llava_onevision(question: str, modality: str):
    """Build the LLaVA-OneVision engine and prompt for an image or a video.

    Args:
        question: Text inserted into the user turn of the prompt.
        modality: ``"image"`` or ``"video"``; selects the placeholder tag
            (``<image>`` / ``<video>``) placed before the question.

    Returns:
        An ``(llm, prompt, stop_token_ids)`` tuple; ``stop_token_ids`` is
        ``None`` for this model.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Check the modality before constructing the engine. Previously an
    # unsupported value left `prompt` unbound and only failed (with an
    # UnboundLocalError) after the model had already been loaded.
    if modality not in ("image", "video"):
        raise ValueError(f"Modality {modality} is not supported.")

    # Use adjacent string literals rather than a backslash continuation
    # inside the string: the continuation made the next source line's
    # indentation part of the prompt text.
    prompt = (f"<|im_start|>user <{modality}>\n{question}<|im_end|>"
              "<|im_start|>assistant\n")

    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
              max_model_len=16384)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
67
68
69# Fuyu
def run_fuyu(question: str, modality: str):
    """Prepare Fuyu-8B for a single-image question.

    The prompt is the question followed by a newline. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple whose last element is ``None``.
    """
    assert modality == "image"

    text = f"{question}\n"
    engine = LLM(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
    )
    return engine, text, None
77
78
79# Phi-3-Vision
def run_phi3v(question: str, modality: str):
    """Prepare Phi-3-Vision for a single-image question.

    Returns an ``(llm, prompt, stop_token_ids)`` tuple; the stop token ids
    entry is ``None``.
    """
    assert modality == "image"

    text = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (128k) for this model may cause OOM.
    # This example lowers both (max_num_seqs=2, max_model_len=4096) so that
    # it can run on lower-end GPUs; raise them if your hardware allows.

    # num_crops is an override kwarg to the multimodal image processor.
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing transform can be
    # found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    processor_overrides = {"num_crops": 16}
    engine = LLM(
        model="microsoft/Phi-3-vision-128k-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs=processor_overrides,
    )
    return engine, text, None
113
114
115# PaliGemma
def run_paligemma(question: str, modality: str):
    """Prepare PaliGemma for a single-image request.

    The prompt is the fixed string ``"caption en"``; ``question`` is not
    used. Returns an ``(llm, prompt, stop_token_ids)`` tuple whose last
    element is ``None``.
    """
    assert modality == "image"

    # PaliGemma has a special prompt format for VQA, so the prompt is fixed
    # rather than derived from the question.
    text = "caption en"
    engine = LLM(model="google/paligemma-3b-mix-224")
    return engine, text, None
124
125
126# Chameleon
def run_chameleon(question: str, modality: str):
    """Prepare Chameleon-7B for a single-image question.

    The ``<image>`` placeholder is appended after the question. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple whose last element is ``None``.
    """
    assert modality == "image"

    text = f"{question}<image>"
    engine = LLM(
        model="facebook/chameleon-7b",
        max_model_len=4096,
    )
    return engine, text, None
134
135
136# MiniCPM-V
def run_minicpmv(question: str, modality: str):
    """Prepare MiniCPM-V (2.6 by default) for a single-image question.

    The prompt is rendered with the tokenizer's chat template and the stop
    token ids are looked up from the tokenizer. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple.
    """
    assert modality == "image"

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now.
    # For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    # model_name = "HwwwH/MiniCPM-V-2"

    # 2.5
    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"

    # 2.6
    checkpoint = "openbmb/MiniCPM-V-2_6"
    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    engine = LLM(
        model=checkpoint,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
    )

    # NOTE: The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]

    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

    # 2.6
    stop_ids = [
        tok.convert_tokens_to_ids(token)
        for token in ('<|im_end|>', '<|endoftext|>')
    ]

    user_turn = {
        'role': 'user',
        'content': f'(<image>./</image>)\n{question}',
    }
    text = tok.apply_chat_template([user_turn],
                                   tokenize=False,
                                   add_generation_prompt=True)
    return engine, text, stop_ids
177
178
179# InternVL
def run_internvl(question: str, modality: str):
    """Prepare InternVL2-2B for a single-image question.

    The prompt is rendered with the tokenizer's chat template. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple where the stop ids are the
    tokenizer ids of the model's stop words.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"

    engine = LLM(model=checkpoint, trust_remote_code=True, max_model_len=4096)

    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    conversation = [{'role': 'user', 'content': f"<image>\n{question}"}]
    text = tok.apply_chat_template(conversation,
                                   tokenize=False,
                                   add_generation_prompt=True)

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens; please refer to the
    # model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_ids = [tok.convert_tokens_to_ids(word) for word in stop_words]
    return engine, text, stop_ids
205
206
207# NVLM-D
def run_nvlm_d(question: str, modality: str):
    """Prepare NVLM-D-72B for a single-image question.

    The prompt is rendered with the tokenizer's chat template. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple whose last element is ``None``.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust these settings as necessary to fit in GPU memory.
    engine = LLM(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
    )

    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    conversation = [{'role': 'user', 'content': f"<image>\n{question}"}]
    text = tok.apply_chat_template(conversation,
                                   tokenize=False,
                                   add_generation_prompt=True)
    return engine, text, None
229
230
231# BLIP-2
def run_blip2(question: str, modality: str):
    """Prepare BLIP-2 (OPT-2.7B) for a single-image question.

    Returns an ``(llm, prompt, stop_token_ids)`` tuple whose last element
    is ``None``.
    """
    assert modality == "image"

    # The BLIP-2 prompt format shown on the HuggingFace model repository is
    # inaccurate.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    text = f"Question: {question} Answer:"
    engine = LLM(model="Salesforce/blip2-opt-2.7b")
    return engine, text, None
241
242
243# Qwen
def run_qwen_vl(question: str, modality: str):
    """Prepare Qwen-VL for a single-image question.

    Returns an ``(llm, prompt, stop_token_ids)`` tuple whose last element
    is ``None``.
    """
    assert modality == "image"

    engine_kwargs = {
        "model": "Qwen/Qwen-VL",
        "trust_remote_code": True,
        "max_model_len": 1024,
        "max_num_seqs": 2,
    }
    engine = LLM(**engine_kwargs)

    text = f"{question}Picture 1: <img></img>\n"
    return engine, text, None
257
258
259# Qwen2-VL
def run_qwen2_vl(question: str, modality: str):
    """Prepare Qwen2-VL-7B-Instruct for a single-image question.

    The prompt is a hand-written chat transcript with a system turn, a user
    turn holding the image placeholder and the question, and an open
    assistant turn. Returns an ``(llm, prompt, stop_token_ids)`` tuple
    whose last element is ``None``.
    """
    assert modality == "image"

    # Tested on L40
    engine = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",
        max_model_len=8192,
        max_num_seqs=5,
    )

    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_turn = ("<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
                 f"{question}<|im_end|>\n")
    text = system_turn + user_turn + "<|im_start|>assistant\n"
    return engine, text, None
278
279
# Llama 3.2
def run_mllama(question: str, modality: str):
    """Prepare Llama-3.2-11B-Vision-Instruct for a single-image question.

    Returns an ``(llm, prompt, stop_token_ids)`` tuple whose last element
    is ``None``.
    """
    assert modality == "image"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_kwargs = {
        "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
        "max_model_len": 4096,
        "max_num_seqs": 16,
        "enforce_eager": True,
    }
    engine = LLM(**engine_kwargs)

    text = f"<|image|><|begin_of_text|>{question}"
    return engine, text, None
301
302
303# Molmo
def run_molmo(question: str, modality: str):
    """Prepare Molmo-7B-D for a single-image question.

    The question is used as the prompt unchanged. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple whose last element is ``None``.
    """
    assert modality == "image"

    engine = LLM(
        model="allenai/Molmo-7B-D-0924",
        trust_remote_code=True,
        dtype="bfloat16",
    )
    return engine, question, None
318
319
320# GLM-4v
def run_glm4v(question: str, modality: str):
    """Prepare GLM-4v-9B for a single-image question.

    The question is used as the prompt unchanged. Returns an
    ``(llm, prompt, stop_token_ids)`` tuple with hard-coded stop token ids.
    """
    assert modality == "image"

    engine_kwargs = {
        "model": "THUDM/glm-4v-9b",
        "max_model_len": 2048,
        "max_num_seqs": 2,
        "trust_remote_code": True,
        "enforce_eager": True,
    }
    engine = LLM(**engine_kwargs)

    # NOTE(review): these ids are presumably the model's end-of-turn tokens;
    # confirm against the model's tokenizer before reusing them elsewhere.
    stop_ids = [151329, 151336, 151338]
    return engine, question, stop_ids
333
334
# Dispatch table from the `--model-type` CLI value to the function that
# builds the engine, prompt and stop token ids for that model. The keys are
# also used as the allowed choices for `--model-type`.
model_example_map = {
    # LLaVA family
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    # Other image models
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
    "chameleon": run_chameleon,
    "minicpmv": run_minicpmv,
    "blip-2": run_blip2,
    "internvl_chat": run_internvl,
    "NVLM_D": run_nvlm_d,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "mllama": run_mllama,
    "molmo": run_molmo,
    "glm4v": run_glm4v,
}
354
355
def get_multi_modal_input(args):
    """Load the bundled sample asset and question for ``args.modality``.

    Returns a dict with two keys: ``"data"`` (the RGB image for
    ``"image"``, or the video frames for ``"video"``, sampled with
    ``args.num_frames``) and ``"question"`` (the question text to ask).

    Raises ``ValueError`` for any other modality.
    """
    kind = args.modality

    if kind == "image":
        # Input image and question
        asset = ImageAsset("cherry_blossom")
        picture = asset.pil_image.convert("RGB")
        return {
            "data": picture,
            "question": "What is the content of this image?",
        }

    if kind == "video":
        # Input video and question
        asset = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames)
        return {
            "data": asset.np_ndarrays,
            "question": "Why is this video funny?",
        }

    raise ValueError(f"Modality {kind} is not supported.")
387
388
def main(args):
    """Run the selected vision-language example and print the generations.

    Args:
        args: Parsed command-line namespace providing ``model_type``,
            ``modality``, ``num_prompts`` and ``num_frames`` (the latter is
            read when the modality is ``"video"``).

    Raises:
        ValueError: If ``num_prompts`` is not positive, if ``model_type`` has
            no entry in ``model_example_map``, or if the modality is not
            supported by ``get_multi_modal_input``.
    """
    # Reject a bad prompt count before any asset or model is loaded. This was
    # previously an `assert` placed after model construction, so it ran late
    # and disappeared entirely under `python -O`.
    if args.num_prompts <= 0:
        raise ValueError(
            f"num_prompts must be positive, got {args.num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    question = mm_input["question"]

    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    if args.num_prompts == 1:
        # Single inference: pass one request dict.
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        }
    else:
        # Batch inference: pass a list with one request dict per prompt.
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        } for _ in range(args.num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        # Print only the first completion of each request.
        generated_text = o.outputs[0].text
        print(generated_text)
431
432
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the example.
    cli = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models")
    cli.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="llava",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    cli.add_argument(
        "--num-prompts",
        type=int,
        default=4,
        help="Number of prompts to run.",
    )
    cli.add_argument(
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the input.",
    )
    cli.add_argument(
        "--num-frames",
        type=int,
        default=16,
        help="Number of frames to extract from the video.",
    )
    main(cli.parse_args())