Skip to content

vllm_omni.model_executor.models.indextts2.utils.front

TextTokenizer

bos_token property

bos_token

bos_token_id property

bos_token_id

eos_token property

eos_token

eos_token_id property

eos_token_id

pad_token property

pad_token

pad_token_id property

pad_token_id

pre_tokenizers instance-attribute

pre_tokenizers = [tokenize_by_CJK_char]

punctuation_marks_tokens class-attribute instance-attribute

punctuation_marks_tokens = [
    ".",
    "!",
    "?",
    "▁.",
    "▁?",
    "▁...",
]

sp_model instance-attribute

sp_model = SentencePieceProcessor(
    model_file=self.vocab_file
)

special_tokens_map property

special_tokens_map

unk_token property

unk_token

unk_token_id property

unk_token_id

vocab_file instance-attribute

vocab_file = vocab_file

vocab_size property

vocab_size

convert_ids_to_tokens

convert_ids_to_tokens(ids: int) -> str
convert_ids_to_tokens(ids: list[int]) -> list[str]
convert_ids_to_tokens(ids: list[int] | int)

convert_tokens_to_ids

convert_tokens_to_ids(tokens: list[str] | str) -> list[int]

decode

decode(ids: list[int] | int, do_lower_case=False, **kwargs)

encode

encode(text: str, **kwargs)

get_vocab

get_vocab()

split_segments

split_segments(
    tokenized: list[str],
    max_text_tokens_per_segment=120,
    quick_streaming_tokens=0,
) -> list[list[str]]

split_segments_by_token staticmethod

split_segments_by_token(
    tokenized_str: list[str],
    split_tokens: list[str],
    max_text_tokens_per_segment: int,
    quick_streaming_tokens: int = 0,
) -> list[list[str]]

tokenize

tokenize(text: str) -> list[str]