OpenAI Vision API Client

Source vllm-project/vllm.

  1"""An example showing how to use vLLM to serve VLMs.
  2
  3Launch the vLLM server with the following command:
  4
  5(single image inference with Llava)
  6vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
  7
  8(multi-image inference with Phi-3.5-vision-instruct)
  9vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
 10    --trust-remote-code --limit-mm-per-prompt image=2
 11"""
 12import base64
 13
 14import requests
 15from openai import OpenAI
 16
 17# Modify OpenAI's API key and API base to use vLLM's API server.
 18openai_api_key = "EMPTY"
 19openai_api_base = "http://localhost:8000/v1"
 20
 21client = OpenAI(
 22    # defaults to os.environ.get("OPENAI_API_KEY")
 23    api_key=openai_api_key,
 24    base_url=openai_api_base,
 25)
 26
 27models = client.models.list()
 28model = models.data[0].id
 29
 30image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 31
 32# Use image url in the payload
 33chat_completion_from_url = client.chat.completions.create(
 34    messages=[{
 35        "role":
 36        "user",
 37        "content": [
 38            {
 39                "type": "text",
 40                "text": "What’s in this image?"
 41            },
 42            {
 43                "type": "image_url",
 44                "image_url": {
 45                    "url": image_url
 46                },
 47            },
 48        ],
 49    }],
 50    model=model,
 51    max_tokens=64,
 52)
 53
 54result = chat_completion_from_url.choices[0].message.content
 55print(f"Chat completion output:{result}")
 56
 57
 58# Use base64 encoded image in the payload
 59def encode_image_base64_from_url(image_url: str) -> str:
 60    """Encode an image retrieved from a remote url to base64 format."""
 61
 62    with requests.get(image_url) as response:
 63        response.raise_for_status()
 64        result = base64.b64encode(response.content).decode('utf-8')
 65
 66    return result
 67
 68
 69image_base64 = encode_image_base64_from_url(image_url=image_url)
 70chat_completion_from_base64 = client.chat.completions.create(
 71    messages=[{
 72        "role":
 73        "user",
 74        "content": [
 75            {
 76                "type": "text",
 77                "text": "What’s in this image?"
 78            },
 79            {
 80                "type": "image_url",
 81                "image_url": {
 82                    "url": f"data:image/jpeg;base64,{image_base64}"
 83                },
 84            },
 85        ],
 86    }],
 87    model=model,
 88    max_tokens=64,
 89)
 90
 91result = chat_completion_from_base64.choices[0].message.content
 92print(f"Chat completion output:{result}")
 93
 94# Multi-image input inference
 95image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
 96image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
 97chat_completion_from_url = client.chat.completions.create(
 98    messages=[{
 99        "role":
100        "user",
101        "content": [
102            {
103                "type": "text",
104                "text": "What are the animals in these images?"
105            },
106            {
107                "type": "image_url",
108                "image_url": {
109                    "url": image_url_duck
110                },
111            },
112            {
113                "type": "image_url",
114                "image_url": {
115                    "url": image_url_lion
116                },
117            },
118        ],
119    }],
120    model=model,
121    max_tokens=64,
122)
123
124result = chat_completion_from_url.choices[0].message.content
125print(f"Chat completion output:{result}")