Florence2 Inference

Source: vllm-project/vllm.

'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference_vision_language.py after porting vision backbone
from vllm import LLM, SamplingParams

dtype = "float"

# Create a Florence-2 encoder/decoder model instance
llm = LLM(
    model="microsoft/Florence-2-base",
    tokenizer="facebook/bart-base",
    dtype=dtype,
    trust_remote_code=True,
)

# Florence-2 task prompts; each plain string is used as the encoder prompt.
prompts = [
    "<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>",
    "<CAPTION_TO_PHRASE_GROUNDING>", "<OD>", "<DENSE_REGION_CAPTION>",
    "<REGION_PROPOSAL>", "<OCR>", "<OCR_WITH_REGION>"
]
# Create a sampling params object (temperature=0 gives greedy decoding).
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    min_tokens=0,
    max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    print(f"Encoder prompt: {encoder_prompt!r}, "
          f"Decoder prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")