Offline Inference Vision Language Embedding

Source: vllm-project/vllm.

 1from vllm import LLM
 2from vllm.assets.image import ImageAsset
 3
 4image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
 5prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501
 6
 7# Create an LLM.
 8llm = LLM(
 9    model="TIGER-Lab/VLM2Vec-Full",
10    trust_remote_code=True,
11    max_model_len=4096,
12    max_num_seqs=2,
13    mm_processor_kwargs={"num_crops": 16},
14)
15
16# Generate embedding. The output is a list of EmbeddingRequestOutputs.
17outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
18
19# Print the outputs.
20for output in outputs:
21    print(output.outputs.embedding)  # list of 3072 floats