vllm_omni.tokenizers.mammoth_moda2_tokenizer ¶

ENDOFTEXT `module-attribute` ¶

ENDOFTEXT = '<|endoftext|>'

EXTRAS `module-attribute` ¶

EXTRAS = tuple(EXTRAS)

IMEND `module-attribute` ¶

IMEND = '<|im_end|>'

IMSTART `module-attribute` ¶

IMSTART = '<|im_start|>'

PAT_STR `module-attribute` ¶

PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

QWEN_SPECIAL_TOKENS `module-attribute` ¶

QWEN_SPECIAL_TOKENS = (
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>",
    "<tool_call>",
    "</tool_call>",
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    "<|fim_pad|>",
    "<|repo_name|>",
    "<|file_sep|>",
)

SPECIAL_START_ID `module-attribute` ¶

SPECIAL_START_ID = 151643

VOCAB_FILES_NAMES `module-attribute` ¶

VOCAB_FILES_NAMES = {
    "vocab_file": "mammothu.tiktoken",
    "special_tokens_file": "mammothu_vision_tokens.txt",
}

MammothUTokenizer ¶

Bases: PreTrainedTokenizer

MammothU tokenizer.

boi_token `instance-attribute` ¶

boi_token = boi_token

bos_token `instance-attribute` ¶

bos_token = bos_token

decoder `instance-attribute` ¶

decoder = {
    v: k for k, v in (self.mergeable_ranks.items())
}

eod_id `instance-attribute` ¶

eod_id = self.tokenizer.eot_token

eof_token `instance-attribute` ¶

eof_token = eof_token

eoi_token `instance-attribute` ¶

eoi_token = eoi_token

eol_token `instance-attribute` ¶

eol_token = eol_token

eos_token `instance-attribute` ¶

eos_token = eos_token

errors `instance-attribute` ¶

errors = errors

gen_image_placeholder_token `instance-attribute` ¶

gen_image_placeholder_token = '<|gen_placeholder|>'

gen_image_token `instance-attribute` ¶

gen_image_token = '<|gen_image_pad|>'

gen_placeholder_id `property` ¶

gen_placeholder_id

image_content_token `instance-attribute` ¶

image_content_token = '<|image_pad|>'

img_token `instance-attribute` ¶

img_token = img_token

mergeable_ranks `instance-attribute` ¶

mergeable_ranks = _load_tiktoken_bpe(vocab_file)

pad_token `instance-attribute` ¶

pad_token = pad_token

special_tokens `instance-attribute` ¶

special_tokens = {
    token: index for index, token in SPECIAL_TOKENS
}

special_tokens_set `instance-attribute` ¶

special_tokens_set = set(t for _, t in SPECIAL_TOKENS)

tokenizer `instance-attribute` ¶

tokenizer = enc

vision_range `instance-attribute` ¶

vision_range = (
    self.get_vocab()[self.boi_token],
    self.tokenizer.n_vocab - 1,
)

visual_tokens `instance-attribute` ¶

visual_tokens = [
    "<|image_pad|>",
    "<|video_pad|>",
    "<|vision_start|>",
    "<|vision_end|>",
]

visual_tokens_ids `instance-attribute` ¶

visual_tokens_ids = [
    (self.get_vocab()[token])
    for token in (self.visual_tokens)
]

vocab_files_names `class-attribute` `instance-attribute` ¶

vocab_files_names = VOCAB_FILES_NAMES

vocab_size `property` ¶

vocab_size: int

add_special_tokens ¶

add_special_tokens(
    special_tokens_dict: dict[str, str | AddedToken],
) -> int

Add special tokens to the tokenizer and update the special tokens mapping. Only tokens that are already present in `special_tokens_set` are added.

Parameters:

Name	Type	Description	Default
`special_tokens_dict`	`dict[str, str \| AddedToken]`	Dictionary of special tokens to add. The key is the token type and the value is the token to add.	required

Returns:

Type	Description
`int`	Number of tokens added to the vocabulary.

bytes_to_str ¶

bytes_to_str(byte_tokens: dict) -> str

Convert byte tokens to string representation.

Parameters:

Name	Type	Description	Default
`byte_tokens`	`dict`	A dictionary whose keys are `bytes` objects and whose values are integers, or a single `bytes` object.	required

Returns:

Type	Description
`dict`	If the input is a dictionary, returns a new dictionary with the `bytes` keys converted to strings.
`str`	If the input is a single `bytes` object, returns its string representation.

convert_tokens_to_ids ¶

convert_tokens_to_ids(
    tokens: bytes | str | list[bytes | str],
) -> list[int]

convert_tokens_to_string ¶

convert_tokens_to_string(tokens: list[bytes | str]) -> str

Converts a sequence of tokens into a single string.

get_vocab ¶

get_vocab() -> dict[bytes | str, int]

save_vocabulary ¶

save_vocabulary(
    save_directory: str, **kwargs
) -> tuple[str]

Save only the vocabulary of the tokenizer.

Returns:

Type	Description
`tuple[str]`	Paths to the files saved.

tokenize ¶

tokenize(
    text: str,
    allowed_special: set | str = "all",
    disallowed_special: Collection | str = (),
    **kwargs,
) -> list[bytes | str]

Converts a string into a sequence of tokens.

Parameters:

Name	Type	Description	Default
`text`	`str`	The sequence to be encoded.	required
`allowed_special`	`Literal["all"]` or `set`	The surface forms of the tokens to be encoded as special tokens in regular text. Defaults to "all".	`'all'`
`disallowed_special`	`Literal["all"]` or `Collection`	The surface forms of the tokens that must not appear in regular text and that trigger an error if they do. Defaults to an empty tuple.	`()`
`kwargs`	`additional keyword arguments, optional`	Passed through to the underlying model-specific encode method.	`{}`

Returns:

Type	Description
`list[bytes \| str]`	The list of tokens.

vllm_omni.tokenizers.mammoth_moda2_tokenizer ¶

ENDOFTEXT module-attribute ¶

EXTRAS module-attribute ¶

IMEND module-attribute ¶

IMSTART module-attribute ¶

PAT_STR module-attribute ¶

QWEN_SPECIAL_TOKENS module-attribute ¶

SPECIAL_START_ID module-attribute ¶

VOCAB_FILES_NAMES module-attribute ¶

MammothUTokenizer ¶

boi_token instance-attribute ¶

bos_token instance-attribute ¶

decoder instance-attribute ¶

eod_id instance-attribute ¶

eof_token instance-attribute ¶

eoi_token instance-attribute ¶

eol_token instance-attribute ¶

eos_token instance-attribute ¶

errors instance-attribute ¶

gen_image_placeholder_token instance-attribute ¶

gen_image_token instance-attribute ¶

gen_placeholder_id property ¶

image_content_token instance-attribute ¶

img_token instance-attribute ¶

mergeable_ranks instance-attribute ¶

pad_token instance-attribute ¶

special_tokens instance-attribute ¶

special_tokens_set instance-attribute ¶

tokenizer instance-attribute ¶

vision_range instance-attribute ¶

visual_tokens instance-attribute ¶

visual_tokens_ids instance-attribute ¶

vocab_files_names class-attribute instance-attribute ¶

vocab_size property ¶

add_special_tokens ¶

bytes_to_str ¶

convert_tokens_to_ids ¶

convert_tokens_to_string ¶

get_vocab ¶

save_vocabulary ¶

tokenize ¶

ENDOFTEXT `module-attribute` ¶

EXTRAS `module-attribute` ¶

IMEND `module-attribute` ¶

IMSTART `module-attribute` ¶

PAT_STR `module-attribute` ¶

QWEN_SPECIAL_TOKENS `module-attribute` ¶

SPECIAL_START_ID `module-attribute` ¶

VOCAB_FILES_NAMES `module-attribute` ¶

boi_token `instance-attribute` ¶

bos_token `instance-attribute` ¶

decoder `instance-attribute` ¶

eod_id `instance-attribute` ¶

eof_token `instance-attribute` ¶

eoi_token `instance-attribute` ¶

eol_token `instance-attribute` ¶

eos_token `instance-attribute` ¶

errors `instance-attribute` ¶

gen_image_placeholder_token `instance-attribute` ¶

gen_image_token `instance-attribute` ¶

gen_placeholder_id `property` ¶

image_content_token `instance-attribute` ¶

img_token `instance-attribute` ¶

mergeable_ranks `instance-attribute` ¶

pad_token `instance-attribute` ¶

special_tokens `instance-attribute` ¶

special_tokens_set `instance-attribute` ¶

tokenizer `instance-attribute` ¶

vision_range `instance-attribute` ¶

visual_tokens `instance-attribute` ¶

visual_tokens_ids `instance-attribute` ¶

vocab_files_names `class-attribute` `instance-attribute` ¶

vocab_size `property` ¶