OpenAI Chat Embedding Client For Multimodal

Source: vllm-project/vllm.

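This client queries a vLLM OpenAI-compatible server for multimodal embeddings by sending chat-style messages to the /v1/embeddings endpoint. Before running it, serve the matching model with the embedding task enabled, for example vllm serve TIGER-Lab/VLM2Vec-Full --task embed --trust-remote-code (the exact flags, such as --chat-template or --max-model-len, depend on the model; check the vLLM serving docs).
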
import argparse
import base64
import io

import requests
from PIL import Image

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


def vlm2vec():
    # VLM2Vec-Full embeds an image together with an instruction-style
    # text prompt in a single chat message.
    response = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "TIGER-Lab/VLM2Vec-Full",
            "messages": [{
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url
                        },
                    },
                    {
                        "type": "text",
                        "text": "Represent the given image."
                    },
                ],
            }],
            "encoding_format": "float",
        },
    )
    response.raise_for_status()
    response_json = response.json()

    print("Embedding output:", response_json["data"][0]["embedding"])


def dse_qwen2_vl(inp: dict):
    # Embedding an image
    if inp["dtype"] == "image":
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": inp["image_url"],
                    },
                },
                {
                    "type": "text",
                    "text": "What is shown in this image?"
                },
            ],
        }]
    # Embedding a text query
    else:
        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
        # of the minimum input size
        buffer = io.BytesIO()
        image_placeholder = Image.new("RGB", (56, 56))
        image_placeholder.save(buffer, "png")
        buffer.seek(0)
        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
        messages = [{
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        # The declared MIME type must match the PNG bytes
                        # written above.
                        "url": f"data:image/png;base64,{image_placeholder}",
                    },
                },
                {
                    "type": "text",
                    "text": f"Query: {inp['content']}"
                },
            ],
        }]

    response = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "MrLight/dse-qwen2-2b-mrl-v1",
            "messages": messages,
            "encoding_format": "float",
        },
    )
    response.raise_for_status()
    response_json = response.json()

    print("Embedding output:", response_json["data"][0]["embedding"])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Script to call a specified VLM through the API. Make "
        "sure to serve the model with --task embed before running this.")
    # "model" is a positional argument, so it is already required by default.
    parser.add_argument("model",
                        type=str,
                        choices=["vlm2vec", "dse_qwen2_vl"],
                        help="Which model to call.")
    args = parser.parse_args()

    if args.model == "vlm2vec":
        vlm2vec()
    elif args.model == "dse_qwen2_vl":
        dse_qwen2_vl({
            "dtype": "image",
            "image_url": image_url,
        })
        dse_qwen2_vl({
            "dtype": "text",
            "content": "What is the weather like today?",
        })
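
For a quick sanity check of the returned vectors, the sketch below embeds an image and a text query with dse-qwen2-2b-mrl-v1 and scores them against each other. It is not part of the original example: get_embedding and cosine_similarity are hypothetical helpers, and it assumes the same server on localhost:8000 as above.

import math

import requests


def get_embedding(messages: list[dict]) -> list[float]:
    # Post chat-style messages to the same /v1/embeddings endpoint used
    # above and return the first embedding vector.
    response = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "MrLight/dse-qwen2-2b-mrl-v1",
            "messages": messages,
            "encoding_format": "float",
        },
    )
    response.raise_for_status()
    return response.json()["data"][0]["embedding"]


def cosine_similarity(a: list[float], b: list[float]) -> float:
    # Cosine similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

Higher scores indicate a closer match between the query and the image, which is how DSE-style retrievers rank candidate documents.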