Offline Inference Vision Language

Source: vllm-project/vllm.

  1"""
  2This example shows how to use vLLM for running offline inference 
  3with the correct prompt format on vision language models.
  4
  5For most models, the prompt format should follow corresponding examples
  6on HuggingFace model repository.
  7"""
  8from transformers import AutoTokenizer
  9
 10from vllm import LLM, SamplingParams
 11from vllm.assets.image import ImageAsset
 12from vllm.assets.video import VideoAsset
 13from vllm.utils import FlexibleArgumentParser
 14
 15# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
 16# lower-end GPUs.
 17# Unless specified, these settings have been tested to work on a single L4.
 18
 19
 20# LLaVA-1.5
 21def run_llava(question: str, modality: str):
 22    assert modality == "image"
 23
 24    prompt = f"USER: <image>\n{question}\nASSISTANT:"
 25
 26    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
 27    stop_token_ids = None
 28    return llm, prompt, stop_token_ids
 29
 30
 31# LLaVA-1.6/LLaVA-NeXT
 32def run_llava_next(question: str, modality: str):
 33    assert modality == "image"
 34
 35    prompt = f"[INST] <image>\n{question} [/INST]"
 36    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
 37    stop_token_ids = None
 38    return llm, prompt, stop_token_ids
 39
 40
 41# LlaVA-NeXT-Video
 42# Currently only support for video input
 43def run_llava_next_video(question: str, modality: str):
 44    assert modality == "video"
 45
 46    prompt = f"USER: <video>\n{question} ASSISTANT:"
 47    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
 48    stop_token_ids = None
 49    return llm, prompt, stop_token_ids
 50
 51
 52# LLaVA-OneVision
 53def run_llava_onevision(question: str, modality: str):
 54
 55    if modality == "video":
 56        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
 57        <|im_start|>assistant\n"
 58
 59    elif modality == "image":
 60        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
 61        <|im_start|>assistant\n"
 62
 63    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
 64              max_model_len=16384)
 65    stop_token_ids = None
 66    return llm, prompt, stop_token_ids
 67
 68
 69# Fuyu
 70def run_fuyu(question: str, modality: str):
 71    assert modality == "image"
 72
 73    prompt = f"{question}\n"
 74    llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
 75    stop_token_ids = None
 76    return llm, prompt, stop_token_ids
 77
 78
 79# Phi-3-Vision
 80def run_phi3v(question: str, modality: str):
 81    assert modality == "image"
 82
 83    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
 84    # Note: The default setting of max_num_seqs (256) and
 85    # max_model_len (128k) for this model may cause OOM.
 86    # You may lower either to run this example on lower-end GPUs.
 87
 88    # In this example, we override max_num_seqs to 5 while
 89    # keeping the original context length of 128k.
 90
 91    # num_crops is an override kwarg to the multimodal image processor;
 92    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
 93    # to use 16 for single frame scenarios, and 4 for multi-frame.
 94    #
 95    # Generally speaking, a larger value for num_crops results in more
 96    # tokens per image instance, because it may scale the image more in
 97    # the image preprocessing. Some references in the model docs and the
 98    # formula for image tokens after the preprocessing
 99    # transform can be found below.
100    #
101    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
102    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
103    llm = LLM(
104        model="microsoft/Phi-3-vision-128k-instruct",
105        trust_remote_code=True,
106        max_model_len=4096,
107        max_num_seqs=2,
108        # Note - mm_processor_kwargs can also be passed to generate/chat calls
109        mm_processor_kwargs={"num_crops": 16},
110    )
111    stop_token_ids = None
112    return llm, prompt, stop_token_ids
113
114
115# PaliGemma
def run_paligemma(question: str, modality: str):
    """Set up PaliGemma with its fixed captioning prompt.

    Only the "image" modality is accepted. ``question`` is not used: the
    prompt is always the literal "caption en", following PaliGemma's
    special prompt format. Returns a ``(llm, prompt, stop_token_ids)``
    tuple with no extra stop token IDs.
    """
    assert modality == "image"

    engine = LLM(model="google/paligemma-3b-mix-224")
    return engine, "caption en", None
124
125
126# Chameleon
def run_chameleon(question: str, modality: str):
    """Set up Chameleon and format ``question`` for it.

    Only the "image" modality is accepted. The image placeholder is
    appended after the question. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    engine = LLM(model="facebook/chameleon-7b", max_model_len=4096)
    return engine, f"{question}<image>", None
134
135
136# MiniCPM-V
def run_minicpmv(question: str, modality: str):
    """Set up MiniCPM-V 2.6 and render ``question`` with its chat template.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple where the stop token IDs are
    looked up from the model's tokenizer.
    """
    assert modality == "image"

    # Checkpoints for other MiniCPM-V versions:
    #   2.0: "HwwwH/MiniCPM-V-2" -- the official repo doesn't work yet, so
    #        a fork is needed for now; see
    #        https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    #   2.5: "openbmb/MiniCPM-Llama3-V-2_5"
    checkpoint = "openbmb/MiniCPM-V-2_6"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    engine = LLM(model=checkpoint,
                 max_model_len=4096,
                 max_num_seqs=2,
                 trust_remote_code=True)

    # Stop tokens differ between MiniCPM-V versions:
    #   2.0: [tokenizer.eos_id]
    #   2.5: [tokenizer.eos_id, tokenizer.eot_id]
    #   2.6: the two special tokens below
    stop_token_ids = list(
        map(tokenizer.convert_tokens_to_ids, ('<|im_end|>', '<|endoftext|>')))

    user_turn = {'role': 'user', 'content': f'(<image>./</image>)\n{question}'}
    prompt = tokenizer.apply_chat_template([user_turn],
                                           tokenize=False,
                                           add_generation_prompt=True)
    return engine, prompt, stop_token_ids
177
178
179# InternVL
def run_internvl(question: str, modality: str):
    """Set up InternVL2 and render ``question`` with its chat template.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple where the stop token IDs are
    looked up from the model's tokenizer.
    """
    assert modality == "image"

    checkpoint = "OpenGVLab/InternVL2-2B"
    engine = LLM(model=checkpoint, trust_remote_code=True, max_model_len=4096)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
    prompt = tokenizer.apply_chat_template([user_turn],
                                           tokenize=False,
                                           add_generation_prompt=True)

    # Different InternVL variants may use different stop tokens; the model
    # card lists the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_words = ("<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>")
    stop_token_ids = list(map(tokenizer.convert_tokens_to_ids, stop_words))
    return engine, prompt, stop_token_ids
205
206
207# NVLM-D
def run_nvlm_d(question: str, modality: str):
    """Set up NVLM-D and render ``question`` with its chat template.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    checkpoint = "nvidia/NVLM-D-72B"

    # Adjust these settings as necessary to fit the model on the GPUs.
    engine = LLM(model=checkpoint,
                 trust_remote_code=True,
                 max_model_len=4096,
                 tensor_parallel_size=4)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                              trust_remote_code=True)
    user_turn = {'role': 'user', 'content': f"<image>\n{question}"}
    prompt = tokenizer.apply_chat_template([user_turn],
                                           tokenize=False,
                                           add_generation_prompt=True)
    return engine, prompt, None
229
230
231# BLIP-2
def run_blip2(question: str, modality: str):
    """Set up BLIP-2 and format ``question`` for it.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    engine = LLM(model="Salesforce/blip2-opt-2.7b")

    # The prompt format shown on the HuggingFace model repository is
    # inaccurate for BLIP-2; see
    # https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    return engine, f"Question: {question} Answer:", None
241
242
243# Qwen
def run_qwen_vl(question: str, modality: str):
    """Set up Qwen-VL and format ``question`` for it.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    engine_kwargs = {
        "model": "Qwen/Qwen-VL",
        "trust_remote_code": True,
        "max_model_len": 1024,
        "max_num_seqs": 2,
    }
    engine = LLM(**engine_kwargs)

    return engine, f"{question}Picture 1: <img></img>\n", None
257
258
259# Qwen2-VL
def run_qwen2_vl(question: str, modality: str):
    """Set up Qwen2-VL and format ``question`` as a chat-style prompt.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    # These settings were tested on an L40.
    engine = LLM(model="Qwen/Qwen2-VL-7B-Instruct",
                 max_model_len=8192,
                 max_num_seqs=5)

    # The prompt is assembled turn by turn: a fixed system turn, the user
    # turn holding the image placeholder and the question, and the opening
    # of the assistant turn.
    system_turn = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n")
    user_turn = ("<|im_start|>user\n"
                 "<|vision_start|><|image_pad|><|vision_end|>"
                 f"{question}<|im_end|>\n")
    assistant_open = "<|im_start|>assistant\n"

    return engine, system_turn + user_turn + assistant_open, None
278
279
280# LLama 3.2
def run_mllama(question: str, modality: str):
    """Set up Llama 3.2 Vision and format ``question`` for it.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    # The model's default max_num_seqs (256) and max_model_len (131072)
    # may cause OOM; lower either one to run on lower-end GPUs. The
    # configuration below has been confirmed to launch on a single L40 GPU.
    engine_kwargs = {
        "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
        "max_model_len": 4096,
        "max_num_seqs": 16,
        "enforce_eager": True,
    }
    engine = LLM(**engine_kwargs)

    return engine, f"<|image|><|begin_of_text|>{question}", None
301
302
303# Molmo
def run_molmo(question: str, modality: str):
    """Set up Molmo; the question is used verbatim as the prompt.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with no extra stop token IDs.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
    )

    # No template is applied here: the raw question is the prompt.
    prompt = question
    stop_token_ids = None
    return llm, prompt, stop_token_ids
318
319
320# GLM-4v
def run_glm4v(question: str, modality: str):
    """Set up GLM-4v; the question is used verbatim as the prompt.

    Only the "image" modality is accepted. Returns a
    ``(llm, prompt, stop_token_ids)`` tuple with three hard-coded stop
    token IDs.
    """
    assert modality == "image"

    engine_kwargs = {
        "model": "THUDM/glm-4v-9b",
        "max_model_len": 2048,
        "max_num_seqs": 2,
        "trust_remote_code": True,
        "enforce_eager": True,
    }
    engine = LLM(**engine_kwargs)

    # NOTE(review): raw token IDs, presumably this checkpoint's
    # end-of-turn/special tokens -- verify against the model's tokenizer.
    return engine, question, [151329, 151336, 151338]
333
334
# Dispatch table from the `--model-type` CLI value to the runner that builds
# the engine, prompt and stop token IDs for that model.
model_example_map = {
    # LLaVA family
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    # Runners that format the prompt with a plain string
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
    "chameleon": run_chameleon,
    # Runners that mostly rely on the tokenizer's chat template
    "minicpmv": run_minicpmv,
    "blip-2": run_blip2,
    "internvl_chat": run_internvl,
    "NVLM_D": run_nvlm_d,
    # Qwen family
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    # Others
    "mllama": run_mllama,
    "molmo": run_molmo,
    "glm4v": run_glm4v,
}
354
355
def get_multi_modal_input(args):
    """Load the bundled sample asset and question for ``args.modality``.

    Returns a dict of the form::

        {
            "data": image or video,
            "question": question,
        }

    For "image" the data is the "cherry_blossom" asset converted to RGB;
    for "video" it is the frames of "sample_demo_1.mp4", with
    ``args.num_frames`` controlling how many frames are requested.

    Raises:
        ValueError: if ``args.modality`` is neither "image" nor "video".
    """
    modality = args.modality

    if modality == "image":
        asset = ImageAsset("cherry_blossom")
        return {
            "data": asset.pil_image.convert("RGB"),
            "question": "What is the content of this image?",
        }

    if modality == "video":
        asset = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames)
        return {
            "data": asset.np_ndarrays,
            "question": "Why is this video funny?",
        }

    raise ValueError(f"Modality {modality} is not supported.")
387
388
def main(args):
    """Run offline inference for the selected model and print the outputs.

    ``args`` must provide ``model_type``, ``modality``, ``num_prompts`` and
    ``num_frames`` (as produced by the argument parser of this script).

    With ``num_prompts == 1`` a single request dict is submitted; otherwise
    a list of ``num_prompts`` identical requests is submitted as a batch.

    Raises:
        ValueError: if ``num_prompts`` is not positive, or if ``model_type``
            has no entry in ``model_example_map``. Both checks run before
            any asset or model is loaded, so bad arguments fail fast.
    """
    # Validate with an explicit exception rather than `assert`: asserts are
    # stripped under `python -O`, and the check belongs before the
    # (expensive) model load, not after it.
    num_prompts = args.num_prompts
    if num_prompts <= 0:
        raise ValueError(
            f"Number of prompts must be positive, got {num_prompts}.")

    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    question = mm_input["question"]

    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    def make_request():
        # A fresh dict per request, so batched requests do not share
        # their (outer or inner) dict objects.
        return {
            "prompt": prompt,
            "multi_modal_data": {
                modality: data
            },
        }

    if num_prompts == 1:
        # Single inference
        inputs = make_request()
    else:
        # Batch inference
        inputs = [make_request() for _ in range(num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
431
432
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, and hand
    # the resulting namespace to main().
    parser = FlexibleArgumentParser(
        description=("Demo on using vLLM for offline inference with "
                     "vision language models"),
    )

    # Which runner from model_example_map to use.
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="llava",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    # 1 submits a single request; larger values submit a batch.
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=4,
        help="Number of prompts to run.",
    )
    parser.add_argument(
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the input.",
    )
    # Only consulted when the modality is "video".
    parser.add_argument(
        "--num-frames",
        type=int,
        default=16,
        help="Number of frames to extract from the video.",
    )

    main(parser.parse_args())