OpenAI Chat Embedding Client For Multimodal#
Source vllm-project/vllm.
1import argparse
2import base64
3import io
4
5import requests
6from PIL import Image
7
8image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
9
10
11def vlm2vec():
12 response = requests.post(
13 "http://localhost:8000/v1/embeddings",
14 json={
15 "model":
16 "TIGER-Lab/VLM2Vec-Full",
17 "messages": [{
18 "role":
19 "user",
20 "content": [
21 {
22 "type": "image_url",
23 "image_url": {
24 "url": image_url
25 }
26 },
27 {
28 "type": "text",
29 "text": "Represent the given image."
30 },
31 ],
32 }],
33 "encoding_format":
34 "float",
35 },
36 )
37 response.raise_for_status()
38 response_json = response.json()
39
40 print("Embedding output:", response_json["data"][0]["embedding"])
41
42
43def dse_qwen2_vl(inp: dict):
44 # Embedding an Image
45 if inp["dtype"] == "image":
46 messages = [{
47 "role":
48 "user",
49 "content": [{
50 "type": "image_url",
51 "image_url": {
52 "url": inp["image_url"],
53 }
54 }, {
55 "type": "text",
56 "text": "What is shown in this image?"
57 }]
58 }]
59 # Embedding a Text Query
60 else:
61 # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
62 # of the minimum input size
63 buffer = io.BytesIO()
64 image_placeholder = Image.new("RGB", (56, 56))
65 image_placeholder.save(buffer, "png")
66 buffer.seek(0)
67 image_placeholder = base64.b64encode(buffer.read()).decode('utf-8')
68 messages = [{
69 "role":
70 "user",
71 "content": [
72 {
73 "type": "image_url",
74 "image_url": {
75 "url": f"data:image/jpeg;base64,{image_placeholder}",
76 }
77 },
78 {
79 "type": "text",
80 "text": f"Query: {inp['content']}"
81 },
82 ]
83 }]
84
85 response = requests.post(
86 "http://localhost:8000/v1/embeddings",
87 json={
88 "model": "MrLight/dse-qwen2-2b-mrl-v1",
89 "messages": messages,
90 "encoding_format": "float",
91 },
92 )
93 response.raise_for_status()
94 response_json = response.json()
95
96 print("Embedding output:", response_json["data"][0]["embedding"])
97
98
99if __name__ == '__main__':
100 parser = argparse.ArgumentParser(
101 "Script to call a specified VLM through the API. Make sure to serve "
102 "the model with --task embed before running this.")
103 parser.add_argument("model",
104 type=str,
105 choices=["vlm2vec", "dse_qwen2_vl"],
106 required=True,
107 help="Which model to call.")
108 args = parser.parse_args()
109
110 if args.model == "vlm2vec":
111 vlm2vec()
112 elif args.model == "dse_qwen2_vl":
113 dse_qwen2_vl({
114 "dtye": "image",
115 "image_url": image_url,
116 })
117 dse_qwen2_vl({
118 "dtype": "text",
119 "content": "What is the weather like today?",
120 })