Offline Inference Vision Language

Offline Inference Vision Language#

Source vllm-project/vllm.

  1"""
  2This example shows how to use vLLM for running offline inference with
  3the correct prompt format on vision language models for text generation.
  4
  5For most models, the prompt format should follow corresponding examples
  6on HuggingFace model repository.
  7"""
  8from transformers import AutoTokenizer
  9
 10from vllm import LLM, SamplingParams
 11from vllm.assets.image import ImageAsset
 12from vllm.assets.video import VideoAsset
 13from vllm.utils import FlexibleArgumentParser
 14
 15# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
 16# lower-end GPUs.
 17# Unless specified, these settings have been tested to work on a single L4.
 18
 19
 20# LLaVA-1.5
 21def run_llava(question: str, modality: str):
 22    assert modality == "image"
 23
 24    prompt = f"USER: <image>\n{question}\nASSISTANT:"
 25
 26    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
 27    stop_token_ids = None
 28    return llm, prompt, stop_token_ids
 29
 30
 31# LLaVA-1.6/LLaVA-NeXT
 32def run_llava_next(question: str, modality: str):
 33    assert modality == "image"
 34
 35    prompt = f"[INST] <image>\n{question} [/INST]"
 36    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
 37    stop_token_ids = None
 38    return llm, prompt, stop_token_ids
 39
 40
# LLaVA-NeXT-Video
# Currently only supports video input
 43def run_llava_next_video(question: str, modality: str):
 44    assert modality == "video"
 45
 46    prompt = f"USER: <video>\n{question} ASSISTANT:"
 47    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
 48    stop_token_ids = None
 49    return llm, prompt, stop_token_ids
 50
 51
 52# LLaVA-OneVision
 53def run_llava_onevision(question: str, modality: str):
 54
 55    if modality == "video":
 56        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
 57        <|im_start|>assistant\n"
 58
 59    elif modality == "image":
 60        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
 61        <|im_start|>assistant\n"
 62
 63    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
 64              max_model_len=16384)
 65    stop_token_ids = None
 66    return llm, prompt, stop_token_ids
 67
 68
 69# Fuyu
 70def run_fuyu(question: str, modality: str):
 71    assert modality == "image"
 72
 73    prompt = f"{question}\n"
 74    llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
 75    stop_token_ids = None
 76    return llm, prompt, stop_token_ids
 77
 78
 79# Phi-3-Vision
 80def run_phi3v(question: str, modality: str):
 81    assert modality == "image"
 82
 83    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
 84    # Note: The default setting of max_num_seqs (256) and
 85    # max_model_len (128k) for this model may cause OOM.
 86    # You may lower either to run this example on lower-end GPUs.
 87
 88    # In this example, we override max_num_seqs to 5 while
 89    # keeping the original context length of 128k.
 90
 91    # num_crops is an override kwarg to the multimodal image processor;
 92    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
 93    # to use 16 for single frame scenarios, and 4 for multi-frame.
 94    #
 95    # Generally speaking, a larger value for num_crops results in more
 96    # tokens per image instance, because it may scale the image more in
 97    # the image preprocessing. Some references in the model docs and the
 98    # formula for image tokens after the preprocessing
 99    # transform can be found below.
100    #
101    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
102    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
103    llm = LLM(
104        model="microsoft/Phi-3-vision-128k-instruct",
105        trust_remote_code=True,
106        max_model_len=4096,
107        max_num_seqs=2,
108        # Note - mm_processor_kwargs can also be passed to generate/chat calls
109        mm_processor_kwargs={"num_crops": 16},
110    )
111    stop_token_ids = None
112    return llm, prompt, stop_token_ids
113
114
115# PaliGemma
def run_paligemma(question: str, modality: str):
    """Set up PaliGemma with its fixed ``"caption en"`` prompt.

    ``question`` is accepted for signature parity with the other examples
    but is not used: the prompt is always the same literal string.
    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    text = "caption en"
    engine = LLM(model="google/paligemma-3b-mix-224")
    return engine, text, None
124
125
126# Chameleon
def run_chameleon(question: str, modality: str):
    """Set up Chameleon-7B; the prompt is the question followed by ``<image>``.

    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    text = f"{question}<image>"
    engine = LLM(model="facebook/chameleon-7b", max_model_len=4096)
    return engine, text, None
134
135
136# MiniCPM-V
def run_minicpmv(question: str, modality: str):
    """Set up MiniCPM-V 2.6 and render the prompt with its chat template.

    Only image input is handled; any other ``modality`` fails the assertion.
    The checkpoint and stop tokens are version specific; the alternatives
    for versions 2.0 and 2.5 are listed in the comments below.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        holds the tokenizer ids of ``<|im_end|>`` and ``<|endoftext|>``.
    """
    assert modality == "image"

    # Checkpoints for other versions:
    #   2.0: "HwwwH/MiniCPM-V-2"
    #        The official repo doesn't work yet, so a fork is needed for now.
    #        See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5: "openbmb/MiniCPM-Llama3-V-2_5"
    #   2.6: (used here)
    checkpoint = "openbmb/MiniCPM-V-2_6"

    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    engine = LLM(
        model=checkpoint,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
    )

    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    #   2.0: [tok.eos_id]
    #   2.5: [tok.eos_id, tok.eot_id]
    #   2.6: the ids of the two tokens below
    stop_token_ids = [
        tok.convert_tokens_to_ids(token)
        for token in ('<|im_end|>', '<|endoftext|>')
    ]

    chat = [{
        'role': 'user',
        'content': f'(<image>./</image>)\n{question}',
    }]
    text = tok.apply_chat_template(chat,
                                   tokenize=False,
                                   add_generation_prompt=True)
    return engine, text, stop_token_ids
177
178
179# H2OVL-Mississippi
def run_h2ovl(question: str, modality: str):
    """Set up H2OVL-Mississippi and render the prompt with its chat template.

    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        contains only the tokenizer's ``eos_token_id``.
    """
    assert modality == "image"

    checkpoint = "h2oai/h2ovl-mississippi-2b"
    engine = LLM(model=checkpoint, trust_remote_code=True, max_model_len=8192)

    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    chat = [{'role': 'user', 'content': f"<image>\n{question}"}]
    text = tok.apply_chat_template(chat,
                                   tokenize=False,
                                   add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-2b
    return engine, text, [tok.eos_token_id]
202
203
204# InternVL
def run_internvl(question: str, modality: str):
    """Set up InternVL2-2B and render the prompt with its chat template.

    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        holds the tokenizer ids of the stop words listed in the body.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"
    engine = LLM(model=checkpoint, trust_remote_code=True, max_model_len=4096)

    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    chat = [{'role': 'user', 'content': f"<image>\n{question}"}]
    text = tok.apply_chat_template(chat,
                                   tokenize=False,
                                   add_generation_prompt=True)

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_token_ids = [tok.convert_tokens_to_ids(word) for word in stop_words]
    return engine, text, stop_token_ids
230
231
232# NVLM-D
def run_nvlm_d(question: str, modality: str):
    """Set up NVLM-D-72B and render the prompt with its chat template.

    Only image input is handled; any other ``modality`` fails the assertion.
    The engine is created with ``tensor_parallel_size=4``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine = LLM(model=checkpoint,
                 trust_remote_code=True,
                 max_model_len=4096,
                 tensor_parallel_size=4)

    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    chat = [{'role': 'user', 'content': f"<image>\n{question}"}]
    text = tok.apply_chat_template(chat,
                                   tokenize=False,
                                   add_generation_prompt=True)
    return engine, text, None
254
255
256# BLIP-2
def run_blip2(question: str, modality: str):
    """Set up BLIP-2 and build its ``Question: ... Answer:`` prompt.

    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    text = f"Question: {question} Answer:"
    engine = LLM(model="Salesforce/blip2-opt-2.7b")
    return engine, text, None
266
267
268# Qwen
def run_qwen_vl(question: str, modality: str):
    """Set up Qwen-VL and build its ``Picture 1: <img></img>`` prompt.

    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    engine = LLM(model="Qwen/Qwen-VL",
                 trust_remote_code=True,
                 max_model_len=1024,
                 max_num_seqs=2)

    text = f"{question}Picture 1: <img></img>\n"
    return engine, text, None
282
283
284# Qwen2-VL
def run_qwen2_vl(question: str, modality: str):
    """Set up Qwen2-VL-7B-Instruct and build its ``<|im_start|>`` chat prompt.

    Only image input is handled; any other ``modality`` fails the assertion.
    The image processor is given ``min_pixels``/``max_pixels`` overrides.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    # Note - mm_processor_kwargs can also be passed to generate/chat calls
    pixel_limits = {
        "min_pixels": 28 * 28,
        "max_pixels": 1280 * 28 * 28,
    }
    engine = LLM(model="Qwen/Qwen2-VL-7B-Instruct",
                 max_model_len=4096,
                 max_num_seqs=5,
                 mm_processor_kwargs=pixel_limits)

    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_turn = ("<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
                 f"{question}<|im_end|>\n")
    assistant_header = "<|im_start|>assistant\n"
    text = system_turn + user_turn + assistant_header
    return engine, text, None
307
308
309# Pixtral HF-format
def run_pixtral_hf(question: str, modality: str):
    """Set up the HF-format Pixtral-12B and build its ``[INST]`` prompt.

    Only image input is handled; any other ``modality`` fails the assertion.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    engine = LLM(model="mistral-community/pixtral-12b", max_model_len=8192)

    text = f"<s>[INST]{question}\n[IMG][/INST]"
    return engine, text, None
323
324
# Llama 3.2
def run_mllama(question: str, modality: str):
    """Set up Llama-3.2-11B-Vision-Instruct and build its ``<|image|>`` prompt.

    Only image input is handled; any other ``modality`` fails the assertion.
    The engine is created with ``enforce_eager=True``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.
    #
    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine = LLM(model="meta-llama/Llama-3.2-11B-Vision-Instruct",
                 max_model_len=4096,
                 max_num_seqs=16,
                 enforce_eager=True)

    text = f"<|image|><|begin_of_text|>{question}"
    return engine, text, None
346
347
348# Molmo
def run_molmo(question: str, modality: str):
    """Set up Molmo-7B-D; the raw question is used as the prompt.

    Only image input is handled; any other ``modality`` fails the assertion.
    The parameters are annotated as ``str`` to match the other ``run_*``
    examples in this file.

    Args:
        question: The user question; passed through unchanged as the prompt.
        modality: Must be ``"image"``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
    )

    # No template is applied here: the prompt is the question itself.
    prompt = question
    stop_token_ids = None
    return llm, prompt, stop_token_ids
363
364
365# GLM-4v
def run_glm4v(question: str, modality: str):
    """Set up GLM-4v-9b; the raw question is used as the prompt.

    Only image input is handled; any other ``modality`` fails the assertion.
    The engine is created with ``enforce_eager=True``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is a fixed list of three token ids.
    """
    assert modality == "image"

    engine = LLM(
        model="THUDM/glm-4v-9b",
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
    )

    # NOTE(review): hard-coded ids, presumably this model's end-of-turn
    # special tokens - confirm against the model's generation config.
    return engine, question, [151329, 151336, 151338]
378
379
380# Idefics3-8B-Llama3
def run_idefics3(question: str, modality: str):
    """Set up Idefics3-8B-Llama3 and build its ``User:``/``Assistant:`` prompt.

    Only image input is handled; any other ``modality`` fails the assertion.
    The image processor is given a ``size.longest_edge`` override.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple, where ``stop_token_ids``
        is ``None`` for this model.
    """
    assert modality == "image"

    # if you are running out of memory, you can reduce the "longest_edge".
    # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_overrides = {"size": {"longest_edge": 3 * 364}}

    engine = LLM(model="HuggingFaceM4/Idefics3-8B-Llama3",
                 max_model_len=8192,
                 max_num_seqs=2,
                 enforce_eager=True,
                 mm_processor_kwargs=processor_overrides)

    text = f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return engine, text, None
403
404
# Dispatch table from the ``--model-type`` CLI value to the function that
# builds that model's example. Each function is called as
# ``fn(question, modality)`` and returns ``(llm, prompt, stop_token_ids)``.
# The keys also serve as the allowed ``--model-type`` choices.
model_example_map = {
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
    "chameleon": run_chameleon,
    "minicpmv": run_minicpmv,
    "blip-2": run_blip2,
    "h2ovl_chat": run_h2ovl,
    "internvl_chat": run_internvl,
    "NVLM_D": run_nvlm_d,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "pixtral_hf": run_pixtral_hf,
    "mllama": run_mllama,
    "molmo": run_molmo,
    "glm4v": run_glm4v,
    "idefics3": run_idefics3,
}
427
428
def get_multi_modal_input(args):
    """Load the sample asset for ``args.modality`` and pair it with a question.

    For ``"image"`` the "cherry_blossom" image asset is converted to RGB;
    for ``"video"`` ``args.num_frames`` frames are taken from the
    "sample_demo_1.mp4" video asset.

    Returns:
        A dict of the form::

            {
                "data": image or video,
                "question": question,
            }

    Raises:
        ValueError: If ``args.modality`` is neither "image" nor "video".
    """
    kind = args.modality

    if kind == "image":
        # Input image and question
        picture = ImageAsset("cherry_blossom").pil_image.convert("RGB")
        return {
            "data": picture,
            "question": "What is the content of this image?",
        }

    if kind == "video":
        # Input video and question
        clip = VideoAsset(name="sample_demo_1.mp4", num_frames=args.num_frames)
        return {
            "data": clip.np_ndarrays,
            "question": "Why is this video funny?",
        }

    raise ValueError(f"Modality {kind} is not supported.")
460
461
def main(args):
    """Run the selected vision-language example and print the generations.

    Args:
        args: Parsed CLI namespace. Reads ``model_type``, ``modality`` and
            ``num_prompts`` directly; ``get_multi_modal_input`` additionally
            reads ``num_frames`` for video input.

    Raises:
        ValueError: If ``num_prompts`` is not positive, or if ``model_type``
            is not a key of ``model_example_map``.
    """
    # Validate the cheap inputs before any model is loaded. A real exception
    # is used rather than ``assert`` so the check survives ``python -O``.
    if args.num_prompts <= 0:
        raise ValueError(
            f"--num-prompts must be positive, got {args.num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    question = mm_input["question"]

    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    if args.num_prompts == 1:
        # Single inference: one request dict
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        }

    else:
        # Batch inference: the same request repeated num_prompts times
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        } for _ in range(args.num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
504
505
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, and hand
    # the resulting namespace to main().
    cli = FlexibleArgumentParser(
        description=("Demo on using vLLM for offline inference with "
                     "vision language models for text generation"))
    cli.add_argument("--model-type", "-m",
                     type=str,
                     default="llava",
                     choices=model_example_map.keys(),
                     help='Huggingface "model_type".')
    cli.add_argument("--num-prompts",
                     type=int,
                     default=4,
                     help="Number of prompts to run.")
    cli.add_argument("--modality",
                     type=str,
                     default="image",
                     choices=["image", "video"],
                     help="Modality of the input.")
    cli.add_argument("--num-frames",
                     type=int,
                     default=16,
                     help="Number of frames to extract from the video.")
    main(cli.parse_args())