Offline Inference Vision Language
Source vllm-project/vllm.
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
on the HuggingFace model repository.
"""
8from transformers import AutoTokenizer
9
10from vllm import LLM, SamplingParams
11from vllm.assets.image import ImageAsset
12from vllm.assets.video import VideoAsset
13from vllm.utils import FlexibleArgumentParser
14
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
18
19
20# LLaVA-1.5
def run_llava(question: str, modality: str):
    """Set up LLaVA-1.5 for a single-image question.

    Args:
        question: Text question inserted into the prompt.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple; no custom stop token ids
        are configured, so the last element is ``None``.
    """
    assert modality == "image"

    engine = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
    text = f"USER: <image>\n{question}\nASSISTANT:"
    return engine, text, None
29
30
31# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question: str, modality: str):
    """Set up LLaVA-1.6 / LLaVA-NeXT for a single-image question.

    Args:
        question: Text question inserted into the prompt.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
                 max_model_len=8192)
    text = f"[INST] <image>\n{question} [/INST]"
    return engine, text, None
39
40
# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(question: str, modality: str):
    """Set up LLaVA-NeXT-Video for a video question.

    Args:
        question: Text question inserted into the prompt.
        modality: Must be ``"video"``; this example does not handle images.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "video"

    engine = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
    text = f"USER: <video>\n{question} ASSISTANT:"
    return engine, text, None
50
51
52# LLaVA-OneVision
def run_llava_onevision(question: str, modality: str):
    """Set up LLaVA-OneVision for an image or video question.

    Args:
        question: Text question inserted into the prompt.
        modality: Either ``"image"`` or ``"video"``; selects the media
            placeholder used in the prompt.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {"image": "<image>", "video": "<video>"}

    # Reject unknown modalities before the engine is constructed; otherwise
    # `prompt` would be left unbound and the failure would only surface at
    # the return statement.
    if modality not in placeholders:
        raise ValueError(f"Modality {modality} is not supported.")

    # Built from adjacent literals rather than a backslash continuation
    # inside one literal: the continuation form embeds the next source
    # line's indentation into the prompt text.
    prompt = (f"<|im_start|>user {placeholders[modality]}\n"
              f"{question}<|im_end|>"
              "<|im_start|>assistant\n")

    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
              max_model_len=16384)
    stop_token_ids = None
    return llm, prompt, stop_token_ids
67
68
69# Fuyu
def run_fuyu(question: str, modality: str):
    """Set up Fuyu-8B for a single-image question.

    Args:
        question: Text question; the prompt is the question followed by a
            newline, with no image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
    text = f"{question}\n"
    return engine, text, None
77
78
79# Phi-3-Vision
def run_phi3v(question: str, modality: str):
    """Set up Phi-3-Vision for a single-image question.

    Args:
        question: Text question inserted into the prompt.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    text = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"

    # The model's default max_num_seqs (256) and max_model_len (128k) may
    # cause OOM, so this example lowers both (max_num_seqs=2,
    # max_model_len=4096). Adjust them to suit your GPU.
    #
    # num_crops is an override kwarg for the multimodal image processor.
    # For some models, e.g. Phi-3.5-vision-instruct, 16 is recommended for
    # single-frame scenarios and 4 for multi-frame ones.
    #
    # Generally, a larger num_crops yields more tokens per image, because
    # the image may be scaled more during preprocessing. The model docs and
    # the image-token formula after the preprocessing transform are at:
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_kwargs = dict(
        model="microsoft/Phi-3-vision-128k-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
    )
    engine = LLM(**engine_kwargs)
    return engine, text, None
113
114
115# PaliGemma
def run_paligemma(question: str, modality: str):
    """Set up PaliGemma for a single-image request.

    Args:
        question: Accepted for interface parity with the other examples but
            not used; a fixed prompt is sent instead.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine = LLM(model="google/paligemma-3b-mix-224")
    # PaliGemma has a special prompt format for VQA, so the prompt is fixed.
    text = "caption en"
    return engine, text, None
124
125
126# Chameleon
def run_chameleon(question: str, modality: str):
    """Set up Chameleon for a single-image question.

    Args:
        question: Text question; the image placeholder is appended after it.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine = LLM(model="facebook/chameleon-7b", max_model_len=4096)
    text = f"{question}<image>"
    return engine, text, None
134
135
136# MiniCPM-V
def run_minicpmv(question: str, modality: str):
    """Set up MiniCPM-V (2.6 by default) for a single-image question.

    Args:
        question: Text question placed after the image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. The prompt is rendered
        with the tokenizer's chat template, and the stop token ids are
        looked up from the tokenizer.
    """
    assert modality == "image"

    # Alternative checkpoints (swap `checkpoint` to try them):
    #   2.0: "HwwwH/MiniCPM-V-2" -- the official repo doesn't work yet, so a
    #        fork is needed for now. For details, see:
    #        https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
    #   2.5: "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6: "openbmb/MiniCPM-V-2_6"
    checkpoint = "openbmb/MiniCPM-V-2_6"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    engine = LLM(
        model=checkpoint,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
    )

    # NOTE: the stop_token_ids differ between MiniCPM-V versions:
    #   2.0: stop_token_ids = [tokenizer.eos_id]
    #   2.5: stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6: the ids of the two tokens below
    stop_ids = [
        tokenizer.convert_tokens_to_ids(token)
        for token in ('<|im_end|>', '<|endoftext|>')
    ]

    user_turn = {
        'role': 'user',
        'content': f'(<image>./</image>)\n{question}',
    }
    text = tokenizer.apply_chat_template([user_turn],
                                         tokenize=False,
                                         add_generation_prompt=True)
    return engine, text, stop_ids
177
178
179# H2OVL-Mississippi
def run_h2ovl(question: str, modality: str):
    """Set up H2OVL-Mississippi for a single-image question.

    Args:
        question: Text question placed after the image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. The prompt is rendered
        with the tokenizer's chat template, and the only stop token id is
        the tokenizer's ``eos_token_id``.
    """
    assert modality == "image"

    checkpoint = "h2oai/h2ovl-mississippi-2b"
    engine = LLM(model=checkpoint, trust_remote_code=True, max_model_len=8192)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
    text = tokenizer.apply_chat_template([user_turn],
                                         tokenize=False,
                                         add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi, see
    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
    return engine, text, [tokenizer.eos_token_id]
202
203
204# InternVL
def run_internvl(question: str, modality: str):
    """Set up InternVL2 for a single-image question.

    Args:
        question: Text question placed after the image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. The prompt is rendered
        with the tokenizer's chat template, and the stop token ids are
        looked up from the tokenizer.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"
    engine = LLM(model=checkpoint, trust_remote_code=True, max_model_len=4096)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
    text = tokenizer.apply_chat_template([user_turn],
                                         tokenize=False,
                                         add_generation_prompt=True)

    # Stop tokens for InternVL. Model variants may have different stop
    # tokens; refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_ids = [tokenizer.convert_tokens_to_ids(word) for word in stop_words]
    return engine, text, stop_ids
230
231
232# NVLM-D
def run_nvlm_d(question: str, modality: str):
    """Set up NVLM-D for a single-image question.

    Args:
        question: Text question placed after the image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. The prompt is rendered
        with the tokenizer's chat template; ``stop_token_ids`` is ``None``.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust these settings as necessary to fit in GPU memory.
    engine = LLM(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
    )

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
    text = tokenizer.apply_chat_template([user_turn],
                                         tokenize=False,
                                         add_generation_prompt=True)
    return engine, text, None
254
255
256# BLIP-2
def run_blip2(question: str, modality: str):
    """Set up BLIP-2 for a single-image question.

    Args:
        question: Text question inserted into the prompt.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine = LLM(model="Salesforce/blip2-opt-2.7b")
    # The BLIP-2 prompt format on the HuggingFace model repository is
    # inaccurate. See
    # https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262
    text = f"Question: {question} Answer:"
    return engine, text, None
266
267
268# Qwen
def run_qwen_vl(question: str, modality: str):
    """Set up Qwen-VL for a single-image question.

    Args:
        question: Text question; the picture placeholder is appended after
            it.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
    )
    engine = LLM(**engine_kwargs)

    text = f"{question}Picture 1: <img></img>\n"
    return engine, text, None
282
283
284# Qwen2-VL
def run_qwen2_vl(question: str, modality: str):
    """Set up Qwen2-VL for a single-image question.

    Args:
        question: Text question placed after the vision placeholder tokens.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    # Pixel bounds forwarded to the multimodal processor.
    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    pixel_limits = {
        "min_pixels": 28 * 28,
        "max_pixels": 1280 * 28 * 28,
    }
    engine = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs=pixel_limits,
    )

    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    image_slot = "<|vision_start|><|image_pad|><|vision_end|>"
    text = (f"{system_turn}<|im_start|>user\n{image_slot}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n")
    return engine, text, None
307
308
309# Pixtral HF-format
def run_pixtral_hf(question: str, modality: str):
    """Set up the HF-format Pixtral checkpoint for a single-image question.

    Args:
        question: Text question placed before the image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    engine = LLM(model="mistral-community/pixtral-12b", max_model_len=8192)
    text = f"<s>[INST]{question}\n[IMG][/INST]"
    return engine, text, None
323
324
325# LLama 3.2
def run_mllama(question: str, modality: str):
    """Set up Llama 3.2 Vision for a single-image question.

    Args:
        question: Text question placed after the image and BOS tokens.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    # Note: the model's default max_num_seqs (256) and max_model_len
    # (131072) may cause OOM. Lower either one to run this example on
    # lower-end GPUs.
    #
    # The configuration below has been confirmed to launch on a single
    # L40 GPU.
    engine_kwargs = dict(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=16,
        enforce_eager=True,
    )
    engine = LLM(**engine_kwargs)

    text = f"<|image|><|begin_of_text|>{question}"
    return engine, text, None
346
347
348# Molmo
def run_molmo(question: str, modality: str):
    """Set up Molmo for a single-image question.

    Args:
        question: Text question; used verbatim as the prompt, with no image
            placeholder added.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
    )

    prompt = question
    stop_token_ids = None
    return llm, prompt, stop_token_ids
363
364
365# GLM-4v
def run_glm4v(question: str, modality: str):
    """Set up GLM-4v for a single-image question.

    Args:
        question: Text question; used verbatim as the prompt.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with three hard-coded stop
        token ids.
    """
    assert modality == "image"

    engine_kwargs = dict(
        model="THUDM/glm-4v-9b",
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
    )
    engine = LLM(**engine_kwargs)

    # NOTE(review): hard-coded ids, presumably this checkpoint's
    # end-of-turn special tokens -- confirm against the tokenizer.
    stop_ids = [151329, 151336, 151338]
    return engine, question, stop_ids
378
379
380# Idefics3-8B-Llama3
def run_idefics3(question: str, modality: str):
    """Set up Idefics3-8B-Llama3 for a single-image question.

    Args:
        question: Text question placed after the image placeholder.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple with ``stop_token_ids``
        set to ``None``.
    """
    assert modality == "image"

    # If you are running out of memory, you can reduce "longest_edge". See:
    # https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_overrides = {"size": {"longest_edge": 3 * 364}}

    engine = LLM(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_overrides,
    )

    text = ("<|begin_of_text|>User:<image>"
            f"{question}<end_of_utterance>\nAssistant:")
    return engine, text, None
403
404
# Maps each `--model-type` choice to the function that builds its engine,
# prompt and stop token ids. Every function takes `(question, modality)` and
# returns `(llm, prompt, stop_token_ids)`.
model_example_map = {
    # LLaVA family
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    # Other models
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
    "chameleon": run_chameleon,
    "minicpmv": run_minicpmv,
    "blip-2": run_blip2,
    "h2ovl_chat": run_h2ovl,
    "internvl_chat": run_internvl,
    "NVLM_D": run_nvlm_d,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "pixtral_hf": run_pixtral_hf,
    "mllama": run_mllama,
    "molmo": run_molmo,
    "glm4v": run_glm4v,
    "idefics3": run_idefics3,
}
427
428
def get_multi_modal_input(args):
    """Load the demo asset and question matching ``args.modality``.

    Args:
        args: Parsed arguments providing ``modality`` and, for video input,
            ``num_frames``.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "question": question,
            }

    Raises:
        ValueError: If ``args.modality`` is neither ``"image"`` nor
            ``"video"``.
    """
    kind = args.modality

    if kind == "image":
        # Input image and question
        picture = ImageAsset("cherry_blossom").pil_image.convert("RGB")
        return {
            "data": picture,
            "question": "What is the content of this image?",
        }

    if kind == "video":
        # Input video and question
        clip = VideoAsset(name="sample_demo_1.mp4",
                          num_frames=args.num_frames)
        return {
            "data": clip.np_ndarrays,
            "question": "Why is this video funny?",
        }

    raise ValueError(f"Modality {args.modality} is not supported.")
460
461
def main(args):
    """Run the selected vision-language example and print the generations.

    Args:
        args: Parsed arguments providing ``model_type``, ``modality``,
            ``num_prompts`` and, for video input, ``num_frames``.

    Raises:
        ValueError: If ``num_prompts`` is not positive, or if the model type
            or modality is not supported.
    """
    # Validate the cheap scalar arguments before any asset or engine is
    # created. A raise (rather than an assert) keeps the check active when
    # Python runs with -O.
    if args.num_prompts <= 0:
        raise ValueError(
            f"Number of prompts must be positive, got {args.num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    question = mm_input["question"]

    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    if args.num_prompts == 1:
        # Single inference: one request dict.
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        }
    else:
        # Batch inference: one separate request dict per prompt.
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        } for _ in range(args.num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        # Print the text of the first output of each request.
        generated_text = o.outputs[0].text
        print(generated_text)
504
505
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the example.
    cli = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    # Which example to run; the choices are the keys of model_example_map.
    cli.add_argument('--model-type', '-m',
                     type=str,
                     default="llava",
                     choices=model_example_map.keys(),
                     help='Huggingface "model_type".')
    cli.add_argument('--num-prompts',
                     type=int,
                     default=4,
                     help='Number of prompts to run.')
    cli.add_argument('--modality',
                     type=str,
                     default="image",
                     choices=['image', 'video'],
                     help='Modality of the input.')
    cli.add_argument('--num-frames',
                     type=int,
                     default=16,
                     help='Number of frames to extract from the video.')

    main(cli.parse_args())