Offline Inference Vision Language Embedding
Source vllm-project/vllm.
from vllm import LLM
from vllm.assets.image import ImageAsset

# Load the "cherry_blossom" image asset as a PIL image and convert it to RGB.
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
# NOTE(review): "<|image_1|>" is presumably the model's image placeholder
# token -- confirm against the model card before changing the prompt format.
prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501

# Create an LLM.
llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,
    mm_processor_kwargs={"num_crops": 16},
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# The text prompt and the image are passed together in a single request dict.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})

# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 3072 floats