vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer ¶

logger `module-attribute` ¶

logger = init_logger(__name__)

Conversation ¶

roles `class-attribute` `instance-attribute` ¶

roles: list[str] = ['User', 'Assistant']

sep `class-attribute` `instance-attribute` ¶

sep: str = '\n\n'

TokenizerEncodeOutput ¶

Bases: BaseOutput

all_image_slices `class-attribute` `instance-attribute` ¶

all_image_slices: list[slice] | None = None

cond_timestep_scatter_index `class-attribute` `instance-attribute` ¶

cond_timestep_scatter_index: Tensor | None = None

cond_vae_image_mask `class-attribute` `instance-attribute` ¶

cond_vae_image_mask: Tensor | None = None

cond_vae_image_slices `class-attribute` `instance-attribute` ¶

cond_vae_image_slices: list[slice] | None = None

cond_vit_image_mask `class-attribute` `instance-attribute` ¶

cond_vit_image_mask: Tensor | None = None

cond_vit_image_slices `class-attribute` `instance-attribute` ¶

cond_vit_image_slices: list[slice] | None = None

gen_image_mask `class-attribute` `instance-attribute` ¶

gen_image_mask: Tensor | None = None

gen_image_slices `class-attribute` `instance-attribute` ¶

gen_image_slices: list[slice] | None = None

gen_timestep_scatter_index `class-attribute` `instance-attribute` ¶

gen_timestep_scatter_index: Tensor | None = None

guidance_scatter_index `class-attribute` `instance-attribute` ¶

guidance_scatter_index: Tensor | None = None

joint_image_slices `class-attribute` `instance-attribute` ¶

joint_image_slices: list[slice] | None = None

real_pos `class-attribute` `instance-attribute` ¶

real_pos: Tensor | None = None

text_mask `class-attribute` `instance-attribute` ¶

text_mask: Tensor | None = None

text_slices `class-attribute` `instance-attribute` ¶

text_slices: list[slice] | None = None

think_recaption_end_pos `class-attribute` `instance-attribute` ¶

think_recaption_end_pos: (
    list[int | None] | list[list[int | None]] | None
) = None

timestep_scatter_index `class-attribute` `instance-attribute` ¶

timestep_scatter_index: Tensor | None = None

tokens `class-attribute` `instance-attribute` ¶

tokens: Tensor | None = None

uncond_cfg_start_pos `class-attribute` `instance-attribute` ¶

uncond_cfg_start_pos: list[int | None] | None = None

TokenizerWrapper ¶

boi_token_id `instance-attribute` ¶

boi_token_id = self.tokenizer.convert_tokens_to_ids("<boi>")

bos_token_id `instance-attribute` ¶

bos_token_id = self.tokenizer.bos_token_id

cfg_token_id `instance-attribute` ¶

cfg_token_id = self.tokenizer.convert_tokens_to_ids("<cfg>")

end_answer_token_id `instance-attribute` ¶

end_answer_token_id = self.tokenizer.convert_tokens_to_ids(
    "</answer>"
)

end_recaption_token_id `instance-attribute` ¶

end_recaption_token_id = (
    self.tokenizer.convert_tokens_to_ids("</recaption>")
)

end_think_token_id `instance-attribute` ¶

end_think_token_id = self.tokenizer.convert_tokens_to_ids(
    "</think>"
)

eoi_token_id `instance-attribute` ¶

eoi_token_id = self.tokenizer.convert_tokens_to_ids("<eoi>")

eos_token_id `instance-attribute` ¶

eos_token_id = self.tokenizer.eos_token_id

img_token_id `instance-attribute` ¶

img_token_id = self.tokenizer.convert_tokens_to_ids("<img>")

pad_token_id `instance-attribute` ¶

pad_token_id = self.tokenizer.pad_token_id

ratio_token_offset `instance-attribute` ¶

ratio_token_offset = self.tokenizer.convert_tokens_to_ids(
    "<img_ratio_0>"
)

special_token_map `instance-attribute` ¶

special_token_map = self.tokenizer.added_tokens_encoder

tokenizer `instance-attribute` ¶

tokenizer = AutoTokenizer.from_pretrained(tokenizer)

apply_chat_template ¶

apply_chat_template(
    batch_prompt: list[str] | None = None,
    batch_message_list: list[list[dict[str, Any]]]
    | None = None,
    mode: str = "gen_text",
    batch_gen_image_info: list[ImageInfo] | None = None,
    batch_cond_image_info: list[JointImageInfo]
    | list[list[JointImageInfo]]
    | None = None,
    batch_system_prompt: list[str] | None = None,
    batch_cot_text: list[str] | None = None,
    max_length: int | None = None,
    bot_task: str = "auto",
    image_base_size: int = 1024,
    sequence_template: str = "pretrain",
    cfg_factor: int = 1,
    add_assistant_prefix: bool | None = None,
    drop_think: bool = False,
) -> dict[str, Any]

apply_general_template ¶

apply_general_template(
    message_list,
    max_length=None,
    add_assistant_prefix=False,
    answer="auto",
    bot_task="auto",
    sequence_template="instruct",
    uncond_p=0.0,
    cfg_factor=1,
    batchify=False,
    image_base_size=1024,
    drop_think=False,
)

batch_gen_infer ¶

batch_gen_infer(
    infer_fn,
    prompt_list: list,
    negative_prompt_list: list | None = None,
    infer_fn_kwargs_list: list[dict[str, int]]
    | None = None,
    do_classifier_free_guidance=False,
    condition_repeat_times: int = 1,
    uncondition_repeat_times: int = 1,
)

Batch inference for the AR-like model training of the text-to-image/instruction tuning tasks.

Parameters:

Name	Type	Description	Default
`infer_fn`	`callable`	Inference function to encode the prompt.	required
`prompt_list`	`list`	List of prompts. Each element can be a single prompt or a list of prompts passed to the infer_fn.	required
`negative_prompt_list`	`list`, optional	List of negative prompts. Only used when do_classifier_free_guidance is True. If None, the `<cfg>` token sequence is used as the negative prompt.	`None`
`infer_fn_kwargs_list`	`List[Dict[str, int]]`, optional	List of keyword arguments for the infer_fn.	`None`
`do_classifier_free_guidance`	`bool`	Whether to do classifier-free guidance.	`False`
`condition_repeat_times`	`int`	Support multi-condition.	`1`
`uncondition_repeat_times`	`int`	Support multi-uncondition.	`1`

decode ¶

decode(*args, **kwargs)

encode ¶

encode(*args, **kwargs)

encode_general ¶

encode_general(
    sections: list[dict[str, Any]] | None = None,
    max_token_length: int | None = None,
    add_eos="auto",
    use_text_mask=True,
    add_pad="auto",
    add_bos=True,
    drop_last="auto",
) -> TokenizerEncodeOutput

General encode function to encode a sequence with multiple sections of text and images. Each section is a dict with a type key and other keys depending on the type.

Supported section types:

text: dict with keys:
- text (str or List[int]): Text to be encoded. Either text or tokens should be provided.
- tokens (List[int]): Pre-encoded text tokens. Either text or tokens should be provided.
- uncond_enabled (bool): Whether to enable uncondition for this text section.
- uncond_p (float): Probability to drop the text section for uncondition.
- max_length (int): Maximum length of the text section.
- ignore (bool): Whether to ignore this text section in the text mask.
- start_offset (int): Start offset of the text mask.
- end_offset (int): End offset of the text mask.
gen_image: dict with keys:
- token_length (int): Number of image tokens.
- add_timestep_token (bool): Whether to add timestep token before the image tokens.
- add_guidance_token (bool): Whether to add guidance token before the image tokens.
- use_front_boi_token (bool): Whether to put the `<boi>` token at the front of the iw, ih and timestep tokens.
- add_image_shape_token (bool): Whether to add image shape token before the image tokens.
- base_size (int): Base size of the image.
- ratio_idx (int): Ratio index of the image.
joint_image: dict with keys:
- token_length (List[int]): Number of image tokens for the two images.
- add_timestep_token (bool): Whether to add timestep token before the image tokens.
- use_front_boi_token (bool): Whether to put the `<boi>` token at the front of the iw, ih and timestep tokens.
- add_image_shape_token (bool): Whether to add image shape token before the image tokens.
- base_size (int): Base size of the image.
- ratio_idx (int): Ratio index of the image.

Parameters:

Name	Type	Description	Default
`sections`	`List[Dict[str, Any]]`	List of sections to be encoded.	`None`
`max_token_length`	`int`	Maximum length of the encoded token sequence.	`None`
`add_eos`	`bool` or `'auto'`	Whether to add eos token at the end of the sequence.	`'auto'`
`use_text_mask`	`bool`	Whether to generate text mask.	`True`
`add_pad`	`bool` or `'auto'`	Whether to add padding tokens to the sequence. If True and total_length is not reached, add padding tokens.	`'auto'`
`add_bos`	`bool`	Whether to add bos token at the beginning of the sequence.	`True`
`drop_last`	`bool` or `'auto'`	If 'auto', drop the last tokens exceeding the total_length if the total_length is provided. If the cut point is in the middle of the image tokens, an error will be raised. If True, drop the last tokens exceeding the total_length. If the cut point is in the middle of the image tokens, all the successive image tokens will be dropped. If False, keep the last tokens exceeding the total_length, even if the total_length is reached.	`'auto'`

Returns:

Type	Description
`TokenizerEncodeOutput`	`TokenizerEncodeOutput`: Encoded token sequence and extra information.

encode_sequence ¶

encode_sequence(
    template: str,
    token_source: dict[str, list],
    total_length=None,
    add_timestep_token=False,
    add_guidance_token=False,
    last_key_only_prefix=False,
    add_eos=True,
    use_front_boi_token=True,
    add_pad=True,
    add_bos=True,
    drop_last: str | bool = "auto",
    add_image_shape_token=False,
) -> tuple[list, dict[str, list]]

Encode a sequence based on the template (e.g., text-image for t2i, text-image-image for instruction tuning) and token source.

Parameters:

Name	Type	Description	Default
`template`	`str`	Template of the sequence. E.g., "text-gen_image" means the sequence is composed of text and an image. "text-text-gen_image" means the sequence is composed of two sections of text and an image.	required
`token_source`	`Dict[str, List]`	Token source for each key in the template, in order. - text: List[Dict]. - gen_image: List[Dict]. - joint_image: List[Dict].	required
`total_length`	`int`	Total length of the encoded sequence, include padding tokens.	`None`
`add_timestep_token`	`bool`	Whether to add timestep token before the image tokens. (Right after the `<img_ratio_*>` / `<img_size_*>` tokens)	`False`
`add_guidance_token`	`bool`	Whether to add guidance token before the image tokens.	`False`
`last_key_only_prefix`	`bool`	Whether to only use the modal prefix in the last key.	`False`
`add_eos`	`bool` or `'auto'`	Whether to add eos token at the end of the sequence. If True, always add eos token. If 'auto', add eos token only when the total_length is not reached and the last token is not `<eos>`.	`True`
`use_front_boi_token`	`bool`	Whether to put the `<boi>` token at the front of the iw, ih and timestep tokens.	`True`
`add_pad`	`bool` or `'auto'`	Whether to add padding tokens to the sequence.	`True`
`add_bos`	`bool`	Whether to add bos token at the beginning of the sequence.	`True`
`drop_last`	`bool` or `'auto'`	If 'auto', drop the last tokens exceeding the total_length if the total_length is provided. If the cut point is in the middle of the image tokens, an error will be raised. If True, drop the last tokens exceeding the total_length. If False, keep the last tokens exceeding the total_length, even if the total_length is reached.	`'auto'`
`add_image_shape_token`	`bool`	Whether to add image shape token before the image tokens. (Right before the `<timestep>` token)	`False`

Returns:

Type	Description
`tuple[list, dict[str, list]]`	`tuple[list, dict]`: A tuple containing: - token_seq (`list`): Encoded token sequence. - extra_token_pos (`dict`): Positions of extra tokens.

encode_text ¶

encode_text(
    *texts,
    uncond_enabled: bool | list[bool] | None = None,
    uncond_p: float | None = None,
    max_length: int | None = None,
    pad: str | None = None,
    return_lengths: bool = False,
) -> list[Tensor]

Encode text and image for AR-like model training of the text-to-image/instruction tuning tasks. Supports encoding multiple texts at once. Each text can be separately conditioned or unconditioned based on `uncond_enabled` and a uniform `uncond_p`. The `<bos>` token is always prepended to the text tokens.

Parameters:

Name	Type	Description	Default
`texts`	`str` or `List[str]`	List of texts to be encoded.	`()`
`uncond_enabled`	`bool` or `List[bool]`	List of flags to indicate whether the text should be unconditioned. If False, the text will never be unconditioned. If True, the text will be unconditioned with uncond_p.	`None`
`uncond_p`	`float`	Probability of dropping the text so that it becomes unconditional. Only works when uncond_enabled is True.	`None`
`max_length`	`int`	Maximum length of the encoded text.	`None`
`pad`	`str`, optional	Padding method. Can be 'left' or 'right'.	`None`
`return_lengths`	`bool`	Whether to return the length of each encoded text.	`False`

Returns:

Type	Description
`list[Tensor]`	`tuple[torch.Tensor, List[int]]` or `torch.Tensor`: If `return_lengths` is True, returns a tuple of (encoded_tokens, lengths). If `return_lengths` is False, returns only the encoded_tokens.

get_cot_sections ¶

get_cot_sections(
    cot_text,
    uncond_kwargs,
    cot_max_length=None,
    drop_think=False,
)

pad ¶

pad(tensor_list, dim=0, pad_val=None)

parse_extra_token_pos `staticmethod` ¶

parse_extra_token_pos(
    extra_token_pos, prefix, tokens, rng=None
)

vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer ¶

logger module-attribute ¶

Conversation ¶

roles class-attribute instance-attribute ¶

sep class-attribute instance-attribute ¶

TokenizerEncodeOutput ¶

all_image_slices class-attribute instance-attribute ¶

cond_timestep_scatter_index class-attribute instance-attribute ¶

cond_vae_image_mask class-attribute instance-attribute ¶

cond_vae_image_slices class-attribute instance-attribute ¶

cond_vit_image_mask class-attribute instance-attribute ¶

cond_vit_image_slices class-attribute instance-attribute ¶

gen_image_mask class-attribute instance-attribute ¶

gen_image_slices class-attribute instance-attribute ¶

gen_timestep_scatter_index class-attribute instance-attribute ¶

guidance_scatter_index class-attribute instance-attribute ¶

joint_image_slices class-attribute instance-attribute ¶

real_pos class-attribute instance-attribute ¶

text_mask class-attribute instance-attribute ¶

text_slices class-attribute instance-attribute ¶

think_recaption_end_pos class-attribute instance-attribute ¶

timestep_scatter_index class-attribute instance-attribute ¶

tokens class-attribute instance-attribute ¶

uncond_cfg_start_pos class-attribute instance-attribute ¶

TokenizerWrapper ¶

boi_token_id instance-attribute ¶

bos_token_id instance-attribute ¶

cfg_token_id instance-attribute ¶

end_answer_token_id instance-attribute ¶

end_recaption_token_id instance-attribute ¶

end_think_token_id instance-attribute ¶

eoi_token_id instance-attribute ¶

eos_token_id instance-attribute ¶

img_token_id instance-attribute ¶

pad_token_id instance-attribute ¶

ratio_token_offset instance-attribute ¶

special_token_map instance-attribute ¶

tokenizer instance-attribute ¶

apply_chat_template ¶

apply_general_template ¶

batch_gen_infer ¶

decode ¶

encode ¶

encode_general ¶

encode_sequence ¶

encode_text ¶

get_cot_sections ¶

pad ¶

parse_extra_token_pos staticmethod ¶

logger `module-attribute` ¶

roles `class-attribute` `instance-attribute` ¶

sep `class-attribute` `instance-attribute` ¶

all_image_slices `class-attribute` `instance-attribute` ¶

cond_timestep_scatter_index `class-attribute` `instance-attribute` ¶

cond_vae_image_mask `class-attribute` `instance-attribute` ¶

cond_vae_image_slices `class-attribute` `instance-attribute` ¶

cond_vit_image_mask `class-attribute` `instance-attribute` ¶

cond_vit_image_slices `class-attribute` `instance-attribute` ¶

gen_image_mask `class-attribute` `instance-attribute` ¶

gen_image_slices `class-attribute` `instance-attribute` ¶

gen_timestep_scatter_index `class-attribute` `instance-attribute` ¶

guidance_scatter_index `class-attribute` `instance-attribute` ¶

joint_image_slices `class-attribute` `instance-attribute` ¶

real_pos `class-attribute` `instance-attribute` ¶

text_mask `class-attribute` `instance-attribute` ¶

text_slices `class-attribute` `instance-attribute` ¶

think_recaption_end_pos `class-attribute` `instance-attribute` ¶

timestep_scatter_index `class-attribute` `instance-attribute` ¶

tokens `class-attribute` `instance-attribute` ¶

uncond_cfg_start_pos `class-attribute` `instance-attribute` ¶

boi_token_id `instance-attribute` ¶

bos_token_id `instance-attribute` ¶

cfg_token_id `instance-attribute` ¶

end_answer_token_id `instance-attribute` ¶

end_recaption_token_id `instance-attribute` ¶

end_think_token_id `instance-attribute` ¶

eoi_token_id `instance-attribute` ¶

eos_token_id `instance-attribute` ¶

img_token_id `instance-attribute` ¶

pad_token_id `instance-attribute` ¶

ratio_token_offset `instance-attribute` ¶

special_token_map `instance-attribute` ¶

tokenizer `instance-attribute` ¶

parse_extra_token_pos `staticmethod` ¶