Skip to content

vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer

logger module-attribute

logger = init_logger(__name__)

Conversation

roles class-attribute instance-attribute

roles: list[str] = ['User', 'Assistant']

sep class-attribute instance-attribute

sep: str = '\n\n'

TokenizerEncodeOutput

Bases: BaseOutput

all_image_slices class-attribute instance-attribute

all_image_slices: list[slice] | None = None

cond_timestep_scatter_index class-attribute instance-attribute

cond_timestep_scatter_index: Tensor | None = None

cond_vae_image_mask class-attribute instance-attribute

cond_vae_image_mask: Tensor | None = None

cond_vae_image_slices class-attribute instance-attribute

cond_vae_image_slices: list[slice] | None = None

cond_vit_image_mask class-attribute instance-attribute

cond_vit_image_mask: Tensor | None = None

cond_vit_image_slices class-attribute instance-attribute

cond_vit_image_slices: list[slice] | None = None

gen_image_mask class-attribute instance-attribute

gen_image_mask: Tensor | None = None

gen_image_slices class-attribute instance-attribute

gen_image_slices: list[slice] | None = None

gen_timestep_scatter_index class-attribute instance-attribute

gen_timestep_scatter_index: Tensor | None = None

guidance_scatter_index class-attribute instance-attribute

guidance_scatter_index: Tensor | None = None

joint_image_slices class-attribute instance-attribute

joint_image_slices: list[slice] | None = None

real_pos class-attribute instance-attribute

real_pos: Tensor | None = None

text_mask class-attribute instance-attribute

text_mask: Tensor | None = None

text_slices class-attribute instance-attribute

text_slices: list[slice] | None = None

think_recaption_end_pos class-attribute instance-attribute

think_recaption_end_pos: (
    list[int | None] | list[list[int | None]] | None
) = None

timestep_scatter_index class-attribute instance-attribute

timestep_scatter_index: Tensor | None = None

tokens class-attribute instance-attribute

tokens: Tensor | None = None

uncond_cfg_start_pos class-attribute instance-attribute

uncond_cfg_start_pos: list[int | None] | None = None

TokenizerWrapper

boi_token_id instance-attribute

boi_token_id = convert_tokens_to_ids('<boi>')

bos_token_id instance-attribute

bos_token_id = bos_token_id

cfg_token_id instance-attribute

cfg_token_id = convert_tokens_to_ids('<cfg>')

end_answer_token_id instance-attribute

end_answer_token_id = convert_tokens_to_ids('</answer>')

end_recaption_token_id instance-attribute

end_recaption_token_id = convert_tokens_to_ids(
    "</recaption>"
)

end_think_token_id instance-attribute

end_think_token_id = convert_tokens_to_ids('</think>')

eoi_token_id instance-attribute

eoi_token_id = convert_tokens_to_ids('<eoi>')

eos_token_id instance-attribute

eos_token_id = eos_token_id

img_token_id instance-attribute

img_token_id = convert_tokens_to_ids('<img>')

pad_token_id instance-attribute

pad_token_id = pad_token_id

ratio_token_offset instance-attribute

ratio_token_offset = convert_tokens_to_ids('<img_ratio_0>')

special_token_map instance-attribute

special_token_map = added_tokens_encoder

tokenizer instance-attribute

tokenizer = from_pretrained(tokenizer)

apply_chat_template

apply_chat_template(
    batch_prompt: list[str] | None = None,
    batch_message_list: list[list[dict[str, Any]]]
    | None = None,
    mode: str = "gen_text",
    batch_gen_image_info: list[ImageInfo] | None = None,
    batch_cond_image_info: list[JointImageInfo]
    | list[list[JointImageInfo]]
    | None = None,
    batch_system_prompt: list[str] | None = None,
    batch_cot_text: list[str] | None = None,
    max_length: int | None = None,
    bot_task: str = "auto",
    image_base_size: int = 1024,
    sequence_template: str = "pretrain",
    cfg_factor: int = 1,
    add_assistant_prefix: bool | None = None,
    drop_think: bool = False,
) -> dict[str, Any]

apply_general_template

apply_general_template(
    message_list,
    max_length=None,
    add_assistant_prefix=False,
    answer="auto",
    bot_task="auto",
    sequence_template="instruct",
    uncond_p=0.0,
    cfg_factor=1,
    batchify=False,
    image_base_size=1024,
    drop_think=False,
)

batch_gen_infer

batch_gen_infer(
    infer_fn,
    prompt_list: list,
    negative_prompt_list: list | None = None,
    infer_fn_kwargs_list: list[dict[str, int]]
    | None = None,
    do_classifier_free_guidance=False,
    condition_repeat_times: int = 1,
    uncondition_repeat_times: int = 1,
)

Batch inference for the AR-like model training of the text-to-image/instruction tuning tasks.

Parameters:

Name Type Description Default
infer_fn `callable`

Inference function to encode the prompt.

required
prompt_list `list`

List of prompts. Each element can be a single prompt or a list of prompts passed to the infer_fn.

required
negative_prompt_list `list`, *optional*

List of negative prompts. Only used when do_classifier_free_guidance is True. If None, the `<cfg>` token sequence is used as the negative prompt.

None
infer_fn_kwargs_list `List[Dict[str, int]]`, *optional*

List of keyword arguments for the infer_fn.

None
do_classifier_free_guidance `bool`

Whether to do classifier-free guidance.

False
condition_repeat_times `int`

Support multi-condition.

1
uncondition_repeat_times `int`

Support multi-uncondition.

1

decode

decode(*args, **kwargs)

encode

encode(*args, **kwargs)

encode_general

encode_general(
    sections: list[dict[str, Any]] | None = None,
    max_token_length: int | None = None,
    add_eos="auto",
    use_text_mask=True,
    add_pad="auto",
    add_bos=True,
    drop_last="auto",
) -> TokenizerEncodeOutput

General encode function to encode a sequence with multiple sections of text and images. Each section is a dict with a type key and other keys depending on the type.

Supported section types:

  • text: dict with keys:

    • text (str or List[int]): Text to be encoded. Either text or tokens should be provided.
    • tokens (List[int]): Pre-encoded text tokens. Either text or tokens should be provided.
    • uncond_enabled (bool): Whether to enable uncondition for this text section.
    • uncond_p (float): Probability to drop the text section for uncondition.
    • max_length (int): Maximum length of the text section.
    • ignore (bool): Whether to ignore this text section in the text mask.
    • start_offset (int): Start offset of the text mask.
    • end_offset (int): End offset of the text mask.
  • gen_image: dict with keys:

    • token_length (int): Number of image tokens.
    • add_timestep_token (bool): Whether to add timestep token before the image tokens.
    • add_guidance_token (bool): Whether to add guidance token before the image tokens.
    • use_front_boi_token (bool): Whether to put the `<boi>` token at the front of iw, ih and timestep tokens.
    • add_image_shape_token (bool): Whether to add image shape token before the image tokens.
    • base_size (int): Base size of the image.
    • ratio_idx (int): Ratio index of the image.
  • joint_image: dict with keys:

    • token_length (List[int]): Number of image tokens for the two images.
    • add_timestep_token (bool): Whether to add timestep token before the image tokens.
    • use_front_boi_token (bool): Whether to put the `<boi>` token at the front of iw, ih and timestep tokens.
    • add_image_shape_token (bool): Whether to add image shape token before the image tokens.
    • base_size (int): Base size of the image.
    • ratio_idx (int): Ratio index of the image.

Parameters:

Name Type Description Default
sections `List[Dict[str, Any]]`

List of sections to be encoded.

None
max_token_length `int`

Maximum length of the encoded token sequence.

None
add_eos `bool` or `'auto'`

Whether to add eos token at the end of the sequence.

'auto'
use_text_mask `bool`

Whether to generate text mask.

True
add_pad `bool` or `'auto'`

Whether to add padding tokens to the sequence. If True and total_length is not reached, add padding tokens.

'auto'
add_bos `bool`

Whether to add bos token at the beginning of the sequence.

True
drop_last `bool` or `'auto'`
  • If auto, drop last tokens exceeding the total_length if the total_length is provided. If the cut point is in the middle of the image tokens, an error will be raised.
  • If True, drop last tokens exceeding the total_length. If cut point is in the middle of the image tokens, all the successive image tokens will be dropped.
  • If False, keep the last tokens exceeding the total_length, even if the total_length is reached.
'auto'

Returns:

Type Description
TokenizerEncodeOutput

TokenizerEncodeOutput: Encoded token sequence and extra information.

encode_sequence

encode_sequence(
    template: str,
    token_source: dict[str, list],
    total_length=None,
    add_timestep_token=False,
    add_guidance_token=False,
    last_key_only_prefix=False,
    add_eos=True,
    use_front_boi_token=True,
    add_pad=True,
    add_bos=True,
    drop_last: str | bool = "auto",
    add_image_shape_token=False,
) -> tuple[list, dict[str, list]]

Encode a sequence based on the template (e.g., text-image for t2i, text-image-image for instruction tuning) and token source.

Parameters:

Name Type Description Default
template `str`

Template of the sequence. E.g., "text-gen_image" means the sequence is composed of text and an image. "text-text-gen_image" means the sequence is composed of two sections of text and an image.

required
token_source `Dict[str, List]`

Token source for each key in the template, in order. - text: List[Dict]. - gen_image: List[Dict]. - joint_image: List[Dict].

required
total_length `int`

Total length of the encoded sequence, include padding tokens.

None
add_timestep_token `bool`

Whether to add timestep token before the image tokens. (Right after the image shape tokens)

False
add_guidance_token `bool`

Whether to add guidance token before the image tokens.

False
last_key_only_prefix `bool`

Whether to only use the modal prefix in the last key.

False
add_eos `bool` or `'auto'`

Whether to add eos token at the end of the sequence. If True, always add eos token. If 'auto', add eos token only when the total_length is not reached and the last token is not `<eos>`.

True
use_front_boi_token `bool`

Whether to put the `<boi>` token at the front of iw, ih and timestep tokens.

True
add_pad `bool` or `'auto'`

Whether to add padding tokens to the sequence.

True
add_bos `bool`

Whether to add bos token at the beginning of the sequence.

True
drop_last `bool` or `'auto'`
  • If auto, drop last tokens exceeding the total_length if the total_length is provided. If the cut point is in the middle of the image tokens, an error will be raised.
  • If True, drop last tokens exceeding the total_length.
  • If False, keep the last tokens exceeding the total_length, even if the total_length is reached.
'auto'
add_image_shape_token `bool`

Whether to add image shape token before the image tokens. (Right before the timestep token)

False

Returns:

Type Description
tuple[list, dict[str, list]]

tuple[list, dict]: A tuple containing: - token_seq (list): Encoded token sequence. - extra_token_pos (dict): Positions of extra tokens.

encode_text

encode_text(
    *texts,
    uncond_enabled: bool | list[bool] | None = None,
    uncond_p: float | None = None,
    max_length: int | None = None,
    pad: str | None = None,
    return_lengths: bool = False,
) -> list[Tensor]

Encode text and image for AR-like model training of the text-to-image/instruction tuning tasks. Supports encoding multiple texts at once. Each text can be separately conditioned or unconditioned based on the uncond_enabled flags and a uniform uncond_p. The `<bos>` token is always prepended to the text tokens.

Parameters:

Name Type Description Default
texts `str` or `List[str]`

List of texts to be encoded.

()
uncond_enabled `bool` or `List[bool]`

List of flags to indicate whether the text should be unconditioned. If False, the text will never be unconditioned. If True, the text will be unconditioned with uncond_p.

None
uncond_p `float`

Probability of using the unconditional text. Only works when uncond_enabled is True.

None
max_length `int`

Maximum length of the encoded text.

None
pad `str`, *optional*

Padding method. Can be 'left' or 'right'.

None
return_lengths `bool`

Whether to return the length of each encoded text.

False

Returns:

Type Description
list[Tensor]

tuple[torch.Tensor, List[int]] or torch.Tensor: If return_lengths is True, returns a tuple of (encoded_tokens, lengths). If return_lengths is False, returns only the encoded_tokens.

get_cot_sections

get_cot_sections(
    cot_text,
    uncond_kwargs,
    cot_max_length=None,
    drop_think=False,
)

pad

pad(tensor_list, dim=0, pad_val=None)

parse_extra_token_pos staticmethod

parse_extra_token_pos(
    extra_token_pos, prefix, tokens, rng=None
)