Skip to content

vllm_omni.diffusion.models.ovis_image.pipeline_ovis_image

logger module-attribute

logger = init_logger(__name__)

OvisImagePipeline

Bases: Module, CFGParallelMixin, DiffusionPipelineProfilerMixin

current_timestep property

current_timestep

default_sample_size instance-attribute

default_sample_size = 128

guidance_scale property

guidance_scale

interrupt property

interrupt

joint_attention_kwargs property

joint_attention_kwargs

num_timesteps property

num_timesteps

od_config instance-attribute

od_config = od_config

scheduler instance-attribute

scheduler = from_pretrained(
    model,
    subfolder="scheduler",
    local_files_only=local_files_only,
)

system_prompt instance-attribute

system_prompt = "Describe the image by detailing the color, quantity, text, shape, size, texture, spatial\n        relationships of the objects and background: "

text_encoder instance-attribute

text_encoder = from_pretrained_with_prefetch(
    from_pretrained,
    model,
    subfolder="text_encoder",
    prefetch_list=ovis_subfolders,
    local_files_only=local_files_only,
    torch_dtype=dtype,
)

tokenizer instance-attribute

tokenizer = from_pretrained(
    model,
    subfolder="tokenizer",
    local_files_only=local_files_only,
)

tokenizer_max_length instance-attribute

tokenizer_max_length = 256 + user_prompt_begin_id

transformer instance-attribute

transformer = OvisImageTransformer2DModel(
    od_config=od_config
)

user_prompt_begin_id instance-attribute

user_prompt_begin_id = 28

vae instance-attribute

vae = to(_execution_device)

vae_scale_factor instance-attribute

vae_scale_factor = (
    2 ** (len(block_out_channels) - 1)
    if getattr(self, "vae", None)
    else 8
)

weights_sources instance-attribute

weights_sources = [
    ComponentSource(
        model_or_path=model,
        subfolder="transformer",
        revision=None,
        prefix="transformer.",
        fall_back_to_pt=True,
    )
]

check_inputs

check_inputs(
    prompt,
    height,
    width,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    callback_on_step_end_tensor_inputs=None,
    max_sequence_length=None,
)

diffuse

diffuse(
    latents: Tensor,
    timesteps: Tensor,
    prompt_embeds: Tensor,
    negative_prompt_embeds: Tensor,
    text_ids: Tensor,
    negative_text_ids: Tensor,
    latent_image_ids: Tensor,
    do_true_cfg: bool,
    guidance_scale: float,
    cfg_normalize: bool = False,
) -> Tensor

Diffusion loop with optional classifier-free guidance.

Parameters:

Name Type Description Default
latents Tensor

Noise latents to denoise

required
timesteps Tensor

Diffusion timesteps

required
prompt_embeds Tensor

Positive prompt embeddings

required
negative_prompt_embeds Tensor

Negative prompt embeddings

required
text_ids Tensor

Position IDs for positive text

required
negative_text_ids Tensor

Position IDs for negative text

required
latent_image_ids Tensor

Position IDs for image latents

required
do_true_cfg bool

Whether to apply CFG

required
guidance_scale float

CFG scale factor

required
cfg_normalize bool

Whether to normalize CFG output (default: False)

False

Returns:

Type Description
Tensor

Denoised latents

encode_prompt

encode_prompt(
    prompt: str | list[str],
    device: device | None = None,
    num_images_per_prompt: int = 1,
    prompt_embeds: FloatTensor | None = None,
)

Parameters:

Name Type Description Default
prompt `str` or `list[str]`, *optional*

prompt to be encoded

required
device device | None

(torch.device, optional): torch.device

None
num_images_per_prompt int

(int): number of images that should be generated per prompt

1
prompt_embeds FloatTensor | None

(torch.FloatTensor, optional): Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated from prompt input argument.

None

forward

forward(
    req: OmniDiffusionRequest,
    prompt: str | list[str] | None = None,
    negative_prompt: str | list[str] | None = None,
    guidance_scale: float = 5.0,
    height: int | None = None,
    width: int | None = None,
    num_inference_steps: int = 50,
    sigmas: list[float] | None = None,
    num_images_per_prompt: int | None = 1,
    generator: Generator | list[Generator] | None = None,
    latents: FloatTensor | None = None,
    prompt_embeds: FloatTensor | None = None,
    negative_prompt_embeds: FloatTensor | None = None,
    output_type: str | None = "pil",
    return_dict: bool = True,
    joint_attention_kwargs: dict[str, Any] | None = None,
    callback_on_step_end: Callable[[int, int, dict], None]
    | None = None,
    callback_on_step_end_tensor_inputs: list[str] = [
        "latents"
    ],
    max_sequence_length: int = 256,
) -> DiffusionOutput

Function invoked when calling the pipeline for generation.

Parameters:

Name Type Description Default
prompt `str` or `list[str]`, *optional*

The prompt or prompts to guide the image generation. If not defined, one has to pass prompt_embeds instead.

None
negative_prompt `str` or `list[str]`, *optional*

The prompt or prompts not to guide the image generation. If not defined, one has to pass negative_prompt_embeds instead. Ignored when not using guidance (i.e., ignored if guidance_scale is not greater than 1).

None
guidance_scale `float`, *optional*, defaults to 5.0

True classifier-free guidance (guidance scale) is enabled when guidance_scale > 1 and negative_prompt is provided.

5.0
height `int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor

The height in pixels of the generated image. This is set to 1024 by default for the best results.

None
width `int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor

The width in pixels of the generated image. This is set to 1024 by default for the best results.

None
num_inference_steps `int`, *optional*, defaults to 50

The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.

50
sigmas `list[float]`, *optional*

Custom sigmas to use for the denoising process with schedulers which support a sigmas argument in their set_timesteps method. If not defined, the default behavior when num_inference_steps is passed will be used.

None
num_images_per_prompt `int`, *optional*, defaults to 1

The number of images to generate per prompt.

1
generator `torch.Generator` or `list[torch.Generator]`, *optional*

One or a list of torch generator(s) to make generation deterministic.

None
latents `torch.FloatTensor`, *optional*

Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random generator.

None
prompt_embeds `torch.FloatTensor`, *optional*

Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated from prompt input argument.

None
negative_prompt_embeds `torch.FloatTensor`, *optional*

Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, negative_prompt_embeds will be generated from negative_prompt input argument.

None
output_type `str`, *optional*, defaults to `"pil"`

The output format of the generated image. Choose between PIL: PIL.Image.Image or np.array.

'pil'
return_dict `bool`, *optional*, defaults to `True`

Whether or not to return a [~pipelines.ovis_image.OvisImagePipelineOutput] instead of a plain tuple.

True
joint_attention_kwargs `dict`, *optional*

A kwargs dictionary that if specified is passed along to the AttentionProcessor as defined under self.processor in diffusers.models.attention_processor.

None
callback_on_step_end `Callable`, *optional*

A function that is called at the end of each denoising step during inference. The function is called with the following arguments: callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: dict). callback_kwargs will include a list of all tensors as specified by callback_on_step_end_tensor_inputs.

None
callback_on_step_end_tensor_inputs `list`, *optional*

The list of tensor inputs for the callback_on_step_end function. The tensors specified in the list will be passed as callback_kwargs argument. You will only be able to include variables listed in the ._callback_tensor_inputs attribute of your pipeline class.

['latents']
max_sequence_length `int` defaults to 256

Maximum sequence length to use with the prompt.

256

Examples:

Returns:

Type Description
DiffusionOutput

[~pipelines.ovis_image.OvisImagePipelineOutput] or tuple:

DiffusionOutput

[~pipelines.ovis_image.OvisImagePipelineOutput] if return_dict is True, otherwise a tuple. When

DiffusionOutput

returning a tuple, the first element is a list with the generated images.

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

prepare_latents

prepare_latents(
    batch_size,
    num_channel_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
)

prepare_timesteps

prepare_timesteps(
    num_inference_steps, sigmas, image_seq_len
)

calculate_shift

calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
)

get_ovis_image_post_process_func

get_ovis_image_post_process_func(
    od_config: OmniDiffusionConfig,
)

retrieve_timesteps

retrieve_timesteps(
    scheduler,
    num_inference_steps: int | None = None,
    device: str | device | None = None,
    timesteps: list[int] | None = None,
    sigmas: list[float] | None = None,
    **kwargs,
) -> tuple[Tensor, int]

Calls the scheduler's set_timesteps method and retrieves timesteps from the scheduler after the call. Handles custom timesteps. Any kwargs will be supplied to scheduler.set_timesteps.

Parameters:

Name Type Description Default
scheduler `SchedulerMixin`

The scheduler to get timesteps from.

required
num_inference_steps `int`, *optional*

The number of diffusion steps used when generating samples with a pre-trained model. If used, timesteps must be None.

None
device `str` or `torch.device`, *optional*

The device to which the timesteps should be moved to. If None, the timesteps are not moved.

None
timesteps `list[int]`, *optional*

Custom timesteps used to override the timestep spacing strategy of the scheduler. If timesteps is passed, num_inference_steps and sigmas must be None.

None
sigmas `list[float]`, *optional*

Custom sigmas used to override the timestep spacing strategy of the scheduler. If sigmas is passed, num_inference_steps and timesteps must be None.

None

Returns:

Type Description
Tensor

Tuple[torch.Tensor, int]: A tuple where the first element is the timestep schedule from the scheduler and the

int

second element is the number of inference steps.