vllm_gaudi.extension.bucketing.common ¶

LONG_CTX_THRESHOLD `module-attribute` ¶

LONG_CTX_THRESHOLD = 8192

HPUBucketingManager ¶

Source code in vllm_gaudi/extension/bucketing/common.py

class HPUBucketingManager():
    # Keep an active-manager handle for code paths that cannot access
    # the runner-local manager object directly (for example, fsdpa setup).
    _active_instance = None

    def __init__(self):
        # All mutable state must be instance-local to avoid cross-model
        # contamination when multiple runners are stashed/restored.
        self.prompt_buckets: List[Tuple[int, int, int]] = []
        self.decode_buckets: List[Tuple[int, int, int]] = []
        # Seed buckets are the buckets originally generated from bucketing configuration
        # Spec decode may automatically add new buckets based on the seed buckets
        self.seed_decode_buckets: List[Tuple[int, int, int]] | None = None
        self.initialized = False

    def activate(self):
        HPUBucketingManager._active_instance = self
        return self

    @classmethod
    def deactivate(cls):
        cls._active_instance = None

    def initialize(self,
                   max_num_seqs,
                   max_num_prefill_seqs,
                   block_size,
                   max_num_batched_tokens,
                   max_model_len,
                   num_speculative_tokens=0,
                   mamba_chunk_size=0,
                   mamba_chunk_size_is_explicit=False):
        self.activate()
        self.max_num_seqs = max_num_seqs
        self.max_num_prefill_seqs = max_num_prefill_seqs
        self.block_size = block_size
        self.max_num_batched_tokens = max_num_batched_tokens
        self.num_hpu_blocks = None
        self._fallback_max_ctx = 0
        self.max_model_len = max_model_len
        self.num_speculative_tokens = num_speculative_tokens
        self.mamba_chunk_size = mamba_chunk_size
        self.mamba_chunk_size_is_explicit = mamba_chunk_size_is_explicit
        self.initialized = True
        self.fallback_bs_base_step = 2
        self.fallback_seq_base_step = max(32, mamba_chunk_size)
        self.fallback_blocks_base_step = 32

        if mamba_chunk_size > 0 and self.max_num_batched_tokens % mamba_chunk_size != 0:
            raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) must be "
                             f"divisible by mamba_chunk_size ({mamba_chunk_size})")

        self.use_sliding_window = get_config().PT_HPU_SDPA_QKV_SLICE_MODE_FWD
        if self.use_sliding_window:
            self.slice_size = get_config().PT_HPU_SDPA_BC_FACTOR if \
                get_config().PT_HPU_SDPA_BC_FACTOR is not None else 1024
            self.slice_thld = get_config().VLLM_FUSEDSDPA_SLIDE_THLD if \
                get_config().VLLM_FUSEDSDPA_SLIDE_THLD is not None else 8192

            msg = (
                f"use_sliding_window {self.use_sliding_window}, slice_size {self.slice_size}, threshold {self.slice_thld}"
            )
            logger().info(msg)

    ### GENERATE BUCKETS FUNCTIONS ###

    def read_from_file(self, is_prompt):
        file_name = get_config().VLLM_BUCKETING_FROM_FILE
        strategy = FileBucketingStrategy()
        return strategy.get_buckets(file_name, is_prompt)

    def get_bucketing_strategy(self):
        # TODO - we can use different strategies for decode and prompt
        bucketing_strategy = get_config().bucketing_strategy
        if bucketing_strategy == 'exp':
            strategy = ExponentialBucketingStrategy()
        elif bucketing_strategy == 'lin':
            strategy = LinearBucketingStrategy()
        elif bucketing_strategy == 'pad':
            strategy = PaddingAwareBucketingStrategy()
        else:
            raise ValueError(
                f"Invalid bucketing strategy: {bucketing_strategy}, please choose from ['exp', 'lin', 'pad']")

        # for backward compatibility - if VLLM_EXPONENTIAL_BUCKETING is set, it will override the bucketing strategy
        exp_bucketing_env = os.getenv('VLLM_EXPONENTIAL_BUCKETING', None)
        if exp_bucketing_env is not None:
            logger().warning(
                "VLLM_EXPONENTIAL_BUCKETING is deprecated and will be removed in a future release. Use VLLM_BUCKETING_STRATEGY='exp'|'lin'|'pad' instead."
            )
            use_exp_bucketing = boolean(exp_bucketing_env)
            if use_exp_bucketing:
                override_strategy = ExponentialBucketingStrategy()
            else:
                override_strategy = LinearBucketingStrategy()
            if override_strategy.__class__ != strategy.__class__:
                logger().warning(
                    f"Overriding bucketing strategy {strategy.__class__.__name__} with {override_strategy.__class__.__name__} due to VLLM_EXPONENTIAL_BUCKETING={exp_bucketing_env}"
                )
                strategy = override_strategy
        return strategy

    def generate_prompt_buckets(self):
        if self.initialized:
            buckets_from_file = None
            bs_range = []
            query_range = []
            ctx_range = []
            if get_config().VLLM_BUCKETING_FROM_FILE:
                buckets_from_file = self.read_from_file(is_prompt=True)
            else:
                strategy = self.get_bucketing_strategy()

                bs_cfg, query_cfg, ctx_cfg = strategy.get_prompt_cfgs(
                    max_num_prefill_seqs=self.max_num_prefill_seqs,
                    block_size=self.block_size,
                    max_num_batched_tokens=self.max_num_batched_tokens,
                    max_model_len=self.max_model_len)

                bs_range = strategy.get_range(bs_cfg)
                query_range = strategy.get_range(query_cfg)
                ctx_range = strategy.get_range(ctx_cfg)
                logger().debug(f"Prompt BS range: {bs_range}")
                logger().debug(f"Prompt query range: {query_range}")
                logger().debug(f"Prompt context range: {ctx_range}")

            self.prompt_buckets = generate_buckets(bs_range, query_range, ctx_range, True, self.max_model_len,
                                                   self.max_num_seqs, self.max_num_prefill_seqs,
                                                   self.max_num_batched_tokens, self.block_size, self.num_hpu_blocks,
                                                   buckets_from_file, self.mamba_chunk_size,
                                                   self.mamba_chunk_size_is_explicit)
            self.log_generate_info(True)
            if self.use_sliding_window:
                self.prompt_buckets = [
                    t for t in self.prompt_buckets
                    if t[2] != 0 or (t[2] == 0 and (t[1] < self.slice_thld or
                                                    (t[1] >= self.slice_thld and t[1] % self.slice_size == 0)))
                ]
                self.log_generate_info(True)
        else:
            logger().info("Bucketing is off - skipping prompt buckets generation")
            self.prompt_buckets = []
        return

    def generate_decode_buckets(self):
        if self.initialized:
            buckets_from_file = None
            bs_range = []
            query_range = []
            ctx_range = []
            if get_config().VLLM_BUCKETING_FROM_FILE:
                buckets_from_file = self.read_from_file(is_prompt=False)
            else:
                strategy = self.get_bucketing_strategy()

                bs_cfg, query_cfg, ctx_cfg = strategy.get_decode_cfgs(
                    max_num_seqs=self.max_num_seqs,
                    block_size=self.block_size,
                    max_num_batched_tokens=self.max_num_batched_tokens,
                    max_model_len=self.max_model_len,
                    max_blocks=self.num_hpu_blocks)

                bs_range = strategy.get_range(bs_cfg)
                query_range = strategy.get_range(query_cfg)
                ctx_range = strategy.get_range(ctx_cfg)

                if get_config().use_contiguous_pa and ctx_range[-1] < self.num_hpu_blocks:
                    ctx_range.append(self.num_hpu_blocks)

                logger().debug(f"Decode BS range: {bs_range}")
                logger().debug(f"Decode query range: {query_range}")
                logger().debug(f"Decode context range: {ctx_range}")

            self.decode_buckets = generate_buckets(bs_range, query_range, ctx_range, False, self.max_model_len,
                                                   self.max_num_seqs, self.max_num_prefill_seqs,
                                                   self.max_num_batched_tokens, self.block_size, self.num_hpu_blocks,
                                                   buckets_from_file, self.mamba_chunk_size,
                                                   self.mamba_chunk_size_is_explicit)
            if self.num_speculative_tokens:
                # The existing buckets are used as seed decode buckets
                self.seed_decode_buckets = self.decode_buckets
                # More buckets are added automatically for spec decode
                self.decode_buckets = self.generate_spec_decode_buckets(self.decode_buckets)
            # Safety cap for fallback: max ctx from ALL prepared decode buckets
            # (including spec decode expansions).  Prevents catastrophic
            # allocations from corrupt batch data while allowing
            # calc_fallback_value to handle moderate overflow.
            self._fallback_max_ctx = max((ctx for _, _, ctx in self.decode_buckets), default=0)

            self.log_generate_info(False)
        else:
            logger().info("Bucketing is off - skipping decode buckets generation")
            self.decode_buckets = []
        return

    def log_generate_info(self, is_prompt=False):
        phase = 'prompt' if is_prompt else 'decode'
        buckets = self.prompt_buckets if is_prompt else self.decode_buckets
        msg = (f"Generated {len(buckets)} "
               f"{phase} buckets [bs, query, num_blocks]: "
               f"{list(buckets)}")
        logger().info(msg)

    ### RETRIEVE BUCKETS FUNCTIONS ###

    def generate_fallback_bucket(self, batch_size, seq_len, ctx, is_prompt=False):
        assert self.max_num_batched_tokens is not None
        new_batch_size = calc_fallback_value(batch_size, self.fallback_bs_base_step)
        if new_batch_size > self.max_num_seqs:
            new_batch_size = self.max_num_seqs
        if self.use_sliding_window and seq_len >= self.slice_thld:
            new_seq_len = math.ceil(seq_len / self.slice_size) * self.slice_size
        else:
            new_seq_len = min(calc_fallback_value(seq_len, self.fallback_seq_base_step), self.max_num_batched_tokens)

        if self.num_hpu_blocks is None:
            new_ctx = 0
        else:
            new_ctx = calc_fallback_value(ctx, self.fallback_blocks_base_step)
            if is_prompt:
                # For prompt buckets, cap to the theoretical max number of
                # context blocks derived from max_model_len.
                max_prompt_ctx = math.ceil((self.max_model_len - 1) / self.block_size)
                if new_ctx > max_prompt_ctx:
                    logger().warning(f"Fallback prompt ctx {new_ctx} exceeds theoretical "
                                     f"max {max_prompt_ctx} (max_model_len="
                                     f"{self.max_model_len}, block_size={self.block_size}),"
                                     f" capping.")
                    new_ctx = max_prompt_ctx
            else:
                # Safety cap: limit to max prepared decode bucket ctx to prevent
                # catastrophic graph compilation from corrupted batch data.
                if self._fallback_max_ctx > 0 and new_ctx > self._fallback_max_ctx:
                    logger().warning(f"Fallback ctx {new_ctx} exceeds max prepared "
                                     f"decode bucket ctx {self._fallback_max_ctx}, capping.")
                    new_ctx = self._fallback_max_ctx
        return (new_batch_size, new_seq_len, new_ctx)

    def find_prompt_bucket(self, batch_size, seq_len, ctx=0):
        if self.initialized:
            found_bucket = find_equal_or_closest_greater_config(self.prompt_buckets, (batch_size, seq_len, ctx))
            if found_bucket is None:
                new_bucket = self.generate_fallback_bucket(batch_size, seq_len, ctx, is_prompt=True)
                logger().warning(f"Prompt bucket for {batch_size, seq_len, ctx}"
                                 f" was not prepared. Adding new bucket: {new_bucket}")
                self.prompt_buckets.append(new_bucket)
                self.prompt_buckets.sort()
                return new_bucket
            return found_bucket
        return (batch_size, seq_len, ctx)

    def find_decode_bucket(self, batch_size, num_blocks, seed_buckets: bool = False):
        if self.initialized:
            # Cap num_blocks to the max prepared ctx so that previously-added
            # capped fallback buckets are found on subsequent lookups, avoiding
            # an infinite loop of "was not prepared" warnings.
            if self._fallback_max_ctx > 0 and num_blocks > self._fallback_max_ctx:
                num_blocks = self._fallback_max_ctx

            if seed_buckets and self.seed_decode_buckets is not None:
                found_bucket = find_equal_or_closest_greater_config(self.seed_decode_buckets,
                                                                    (batch_size, 1, num_blocks))
                if found_bucket is not None:
                    return found_bucket

            found_bucket = find_equal_or_closest_greater_config(self.decode_buckets, (batch_size, 1, num_blocks))
            if found_bucket is None:
                new_bucket = self.generate_fallback_bucket(batch_size, 1, num_blocks)
                if new_bucket not in self.decode_buckets:
                    logger().warning(f"Decode bucket for {batch_size, 1, num_blocks}"
                                     f" was not prepared. Adding new bucket: {new_bucket}")
                    self.decode_buckets.append(new_bucket)
                    self.decode_buckets.sort()
                return new_bucket
            return found_bucket
        return (batch_size, 1, num_blocks)

    def get_max_prompt_shape(self):
        return max(b[1] for b in self.prompt_buckets) \
               if len(self.prompt_buckets) > 0 else self.max_model_len

    def generate_spec_decode_buckets(self, seed_decode_buckets):
        max_model_len = self.max_model_len
        block_size = self.block_size

        def no_corrections(bs, query, ctx):
            return (bs, query, ctx)

        def correct_for_max_model_len(bs, query, ctx):
            return (bs, query, min(ctx, bs * math.ceil(max_model_len / block_size)))

        def get_corrector(use_contiguous_pa):
            if use_contiguous_pa:
                return no_corrections
            else:
                return correct_for_max_model_len

        use_contiguous_pa = get_config().use_contiguous_pa
        corrector = get_corrector(use_contiguous_pa)

        # If spec decode enabled, generate buckets for batch_size * (1 + num_speculative_tokens)
        num_tokens = 1 + self.num_speculative_tokens
        buckets = set()
        for bucket in seed_decode_buckets:
            buckets.add(bucket)
            bs, query, ctx = bucket
            spec_decode_bs = bs * num_tokens
            if spec_decode_bs <= ctx:
                # Add a bucket with (batch_size * num_tokens, query, ctx)
                buckets.add(corrector(spec_decode_bs, query, ctx))
            # Add a bucket with (batch_size * num_tokens, query, ctx * num_tokens)
            buckets.add(corrector(spec_decode_bs, query, ctx * num_tokens))

        # Log the new generated spec decode buckets
        new_buckets = sorted(buckets - set(seed_decode_buckets))
        msg = (f"Generated {len(new_buckets)} "
               f"spec decode buckets [bs, query, num_blocks]: {list(new_buckets)}")
        logger().info(msg)

        return sorted(buckets)

    @classmethod
    def get_instance(cls):
        """Retrieve the currently active bucketing manager instance."""
        if cls._active_instance is None:
            raise RuntimeError("No active HPUBucketingManager instance. Call initialize() or activate() first.")
        return cls._active_instance

_active_instance `class-attribute` `instance-attribute` ¶

_active_instance = None

decode_buckets `instance-attribute` ¶

decode_buckets: List[Tuple[int, int, int]] = []

initialized `instance-attribute` ¶

initialized = False

prompt_buckets `instance-attribute` ¶

prompt_buckets: List[Tuple[int, int, int]] = []

seed_decode_buckets `instance-attribute` ¶

seed_decode_buckets: List[Tuple[int, int, int]] | None = (
    None
)

init ¶

__init__()

Source code in vllm_gaudi/extension/bucketing/common.py

def __init__(self):
    # All mutable state must be instance-local to avoid cross-model
    # contamination when multiple runners are stashed/restored.
    self.prompt_buckets: List[Tuple[int, int, int]] = []
    self.decode_buckets: List[Tuple[int, int, int]] = []
    # Seed buckets are the buckets originally generated from bucketing configuration
    # Spec decode may automatically add new buckets based on the seed buckets
    self.seed_decode_buckets: List[Tuple[int, int, int]] | None = None
    self.initialized = False

activate ¶

activate()

Source code in vllm_gaudi/extension/bucketing/common.py

def activate(self):
    HPUBucketingManager._active_instance = self
    return self

deactivate `classmethod` ¶

deactivate()

Source code in vllm_gaudi/extension/bucketing/common.py

@classmethod
def deactivate(cls):
    cls._active_instance = None

find_decode_bucket ¶

find_decode_bucket(
    batch_size, num_blocks, seed_buckets: bool = False
)

Source code in vllm_gaudi/extension/bucketing/common.py

def find_decode_bucket(self, batch_size, num_blocks, seed_buckets: bool = False):
    if self.initialized:
        # Cap num_blocks to the max prepared ctx so that previously-added
        # capped fallback buckets are found on subsequent lookups, avoiding
        # an infinite loop of "was not prepared" warnings.
        if self._fallback_max_ctx > 0 and num_blocks > self._fallback_max_ctx:
            num_blocks = self._fallback_max_ctx

        if seed_buckets and self.seed_decode_buckets is not None:
            found_bucket = find_equal_or_closest_greater_config(self.seed_decode_buckets,
                                                                (batch_size, 1, num_blocks))
            if found_bucket is not None:
                return found_bucket

        found_bucket = find_equal_or_closest_greater_config(self.decode_buckets, (batch_size, 1, num_blocks))
        if found_bucket is None:
            new_bucket = self.generate_fallback_bucket(batch_size, 1, num_blocks)
            if new_bucket not in self.decode_buckets:
                logger().warning(f"Decode bucket for {batch_size, 1, num_blocks}"
                                 f" was not prepared. Adding new bucket: {new_bucket}")
                self.decode_buckets.append(new_bucket)
                self.decode_buckets.sort()
            return new_bucket
        return found_bucket
    return (batch_size, 1, num_blocks)

find_prompt_bucket ¶

find_prompt_bucket(batch_size, seq_len, ctx=0)

Source code in vllm_gaudi/extension/bucketing/common.py

def find_prompt_bucket(self, batch_size, seq_len, ctx=0):
    if self.initialized:
        found_bucket = find_equal_or_closest_greater_config(self.prompt_buckets, (batch_size, seq_len, ctx))
        if found_bucket is None:
            new_bucket = self.generate_fallback_bucket(batch_size, seq_len, ctx, is_prompt=True)
            logger().warning(f"Prompt bucket for {batch_size, seq_len, ctx}"
                             f" was not prepared. Adding new bucket: {new_bucket}")
            self.prompt_buckets.append(new_bucket)
            self.prompt_buckets.sort()
            return new_bucket
        return found_bucket
    return (batch_size, seq_len, ctx)

generate_decode_buckets ¶

generate_decode_buckets()

Source code in vllm_gaudi/extension/bucketing/common.py

def generate_decode_buckets(self):
    if self.initialized:
        buckets_from_file = None
        bs_range = []
        query_range = []
        ctx_range = []
        if get_config().VLLM_BUCKETING_FROM_FILE:
            buckets_from_file = self.read_from_file(is_prompt=False)
        else:
            strategy = self.get_bucketing_strategy()

            bs_cfg, query_cfg, ctx_cfg = strategy.get_decode_cfgs(
                max_num_seqs=self.max_num_seqs,
                block_size=self.block_size,
                max_num_batched_tokens=self.max_num_batched_tokens,
                max_model_len=self.max_model_len,
                max_blocks=self.num_hpu_blocks)

            bs_range = strategy.get_range(bs_cfg)
            query_range = strategy.get_range(query_cfg)
            ctx_range = strategy.get_range(ctx_cfg)

            if get_config().use_contiguous_pa and ctx_range[-1] < self.num_hpu_blocks:
                ctx_range.append(self.num_hpu_blocks)

            logger().debug(f"Decode BS range: {bs_range}")
            logger().debug(f"Decode query range: {query_range}")
            logger().debug(f"Decode context range: {ctx_range}")

        self.decode_buckets = generate_buckets(bs_range, query_range, ctx_range, False, self.max_model_len,
                                               self.max_num_seqs, self.max_num_prefill_seqs,
                                               self.max_num_batched_tokens, self.block_size, self.num_hpu_blocks,
                                               buckets_from_file, self.mamba_chunk_size,
                                               self.mamba_chunk_size_is_explicit)
        if self.num_speculative_tokens:
            # The existing buckets are used as seed decode buckets
            self.seed_decode_buckets = self.decode_buckets
            # More buckets are added automatically for spec decode
            self.decode_buckets = self.generate_spec_decode_buckets(self.decode_buckets)
        # Safety cap for fallback: max ctx from ALL prepared decode buckets
        # (including spec decode expansions).  Prevents catastrophic
        # allocations from corrupt batch data while allowing
        # calc_fallback_value to handle moderate overflow.
        self._fallback_max_ctx = max((ctx for _, _, ctx in self.decode_buckets), default=0)

        self.log_generate_info(False)
    else:
        logger().info("Bucketing is off - skipping decode buckets generation")
        self.decode_buckets = []
    return

generate_fallback_bucket ¶

generate_fallback_bucket(
    batch_size, seq_len, ctx, is_prompt=False
)

Source code in vllm_gaudi/extension/bucketing/common.py

def generate_fallback_bucket(self, batch_size, seq_len, ctx, is_prompt=False):
    assert self.max_num_batched_tokens is not None
    new_batch_size = calc_fallback_value(batch_size, self.fallback_bs_base_step)
    if new_batch_size > self.max_num_seqs:
        new_batch_size = self.max_num_seqs
    if self.use_sliding_window and seq_len >= self.slice_thld:
        new_seq_len = math.ceil(seq_len / self.slice_size) * self.slice_size
    else:
        new_seq_len = min(calc_fallback_value(seq_len, self.fallback_seq_base_step), self.max_num_batched_tokens)

    if self.num_hpu_blocks is None:
        new_ctx = 0
    else:
        new_ctx = calc_fallback_value(ctx, self.fallback_blocks_base_step)
        if is_prompt:
            # For prompt buckets, cap to the theoretical max number of
            # context blocks derived from max_model_len.
            max_prompt_ctx = math.ceil((self.max_model_len - 1) / self.block_size)
            if new_ctx > max_prompt_ctx:
                logger().warning(f"Fallback prompt ctx {new_ctx} exceeds theoretical "
                                 f"max {max_prompt_ctx} (max_model_len="
                                 f"{self.max_model_len}, block_size={self.block_size}),"
                                 f" capping.")
                new_ctx = max_prompt_ctx
        else:
            # Safety cap: limit to max prepared decode bucket ctx to prevent
            # catastrophic graph compilation from corrupted batch data.
            if self._fallback_max_ctx > 0 and new_ctx > self._fallback_max_ctx:
                logger().warning(f"Fallback ctx {new_ctx} exceeds max prepared "
                                 f"decode bucket ctx {self._fallback_max_ctx}, capping.")
                new_ctx = self._fallback_max_ctx
    return (new_batch_size, new_seq_len, new_ctx)

generate_prompt_buckets ¶

generate_prompt_buckets()

Source code in vllm_gaudi/extension/bucketing/common.py

def generate_prompt_buckets(self):
    if self.initialized:
        buckets_from_file = None
        bs_range = []
        query_range = []
        ctx_range = []
        if get_config().VLLM_BUCKETING_FROM_FILE:
            buckets_from_file = self.read_from_file(is_prompt=True)
        else:
            strategy = self.get_bucketing_strategy()

            bs_cfg, query_cfg, ctx_cfg = strategy.get_prompt_cfgs(
                max_num_prefill_seqs=self.max_num_prefill_seqs,
                block_size=self.block_size,
                max_num_batched_tokens=self.max_num_batched_tokens,
                max_model_len=self.max_model_len)

            bs_range = strategy.get_range(bs_cfg)
            query_range = strategy.get_range(query_cfg)
            ctx_range = strategy.get_range(ctx_cfg)
            logger().debug(f"Prompt BS range: {bs_range}")
            logger().debug(f"Prompt query range: {query_range}")
            logger().debug(f"Prompt context range: {ctx_range}")

        self.prompt_buckets = generate_buckets(bs_range, query_range, ctx_range, True, self.max_model_len,
                                               self.max_num_seqs, self.max_num_prefill_seqs,
                                               self.max_num_batched_tokens, self.block_size, self.num_hpu_blocks,
                                               buckets_from_file, self.mamba_chunk_size,
                                               self.mamba_chunk_size_is_explicit)
        self.log_generate_info(True)
        if self.use_sliding_window:
            self.prompt_buckets = [
                t for t in self.prompt_buckets
                if t[2] != 0 or (t[2] == 0 and (t[1] < self.slice_thld or
                                                (t[1] >= self.slice_thld and t[1] % self.slice_size == 0)))
            ]
            self.log_generate_info(True)
    else:
        logger().info("Bucketing is off - skipping prompt buckets generation")
        self.prompt_buckets = []
    return

generate_spec_decode_buckets ¶

generate_spec_decode_buckets(seed_decode_buckets)

Source code in vllm_gaudi/extension/bucketing/common.py

def generate_spec_decode_buckets(self, seed_decode_buckets):
    max_model_len = self.max_model_len
    block_size = self.block_size

    def no_corrections(bs, query, ctx):
        return (bs, query, ctx)

    def correct_for_max_model_len(bs, query, ctx):
        return (bs, query, min(ctx, bs * math.ceil(max_model_len / block_size)))

    def get_corrector(use_contiguous_pa):
        if use_contiguous_pa:
            return no_corrections
        else:
            return correct_for_max_model_len

    use_contiguous_pa = get_config().use_contiguous_pa
    corrector = get_corrector(use_contiguous_pa)

    # If spec decode enabled, generate buckets for batch_size * (1 + num_speculative_tokens)
    num_tokens = 1 + self.num_speculative_tokens
    buckets = set()
    for bucket in seed_decode_buckets:
        buckets.add(bucket)
        bs, query, ctx = bucket
        spec_decode_bs = bs * num_tokens
        if spec_decode_bs <= ctx:
            # Add a bucket with (batch_size * num_tokens, query, ctx)
            buckets.add(corrector(spec_decode_bs, query, ctx))
        # Add a bucket with (batch_size * num_tokens, query, ctx * num_tokens)
        buckets.add(corrector(spec_decode_bs, query, ctx * num_tokens))

    # Log the new generated spec decode buckets
    new_buckets = sorted(buckets - set(seed_decode_buckets))
    msg = (f"Generated {len(new_buckets)} "
           f"spec decode buckets [bs, query, num_blocks]: {list(new_buckets)}")
    logger().info(msg)

    return sorted(buckets)

get_bucketing_strategy ¶

get_bucketing_strategy()

Source code in vllm_gaudi/extension/bucketing/common.py

def get_bucketing_strategy(self):
    # TODO - we can use different strategies for decode and prompt
    bucketing_strategy = get_config().bucketing_strategy
    if bucketing_strategy == 'exp':
        strategy = ExponentialBucketingStrategy()
    elif bucketing_strategy == 'lin':
        strategy = LinearBucketingStrategy()
    elif bucketing_strategy == 'pad':
        strategy = PaddingAwareBucketingStrategy()
    else:
        raise ValueError(
            f"Invalid bucketing strategy: {bucketing_strategy}, please choose from ['exp', 'lin', 'pad']")

    # for backward compatibility - if VLLM_EXPONENTIAL_BUCKETING is set, it will override the bucketing strategy
    exp_bucketing_env = os.getenv('VLLM_EXPONENTIAL_BUCKETING', None)
    if exp_bucketing_env is not None:
        logger().warning(
            "VLLM_EXPONENTIAL_BUCKETING is deprecated and will be removed in a future release. Use VLLM_BUCKETING_STRATEGY='exp'|'lin'|'pad' instead."
        )
        use_exp_bucketing = boolean(exp_bucketing_env)
        if use_exp_bucketing:
            override_strategy = ExponentialBucketingStrategy()
        else:
            override_strategy = LinearBucketingStrategy()
        if override_strategy.__class__ != strategy.__class__:
            logger().warning(
                f"Overriding bucketing strategy {strategy.__class__.__name__} with {override_strategy.__class__.__name__} due to VLLM_EXPONENTIAL_BUCKETING={exp_bucketing_env}"
            )
            strategy = override_strategy
    return strategy

get_instance `classmethod` ¶

get_instance()

Retrieve the currently active bucketing manager instance.

Source code in vllm_gaudi/extension/bucketing/common.py

@classmethod
def get_instance(cls):
    """Retrieve the currently active bucketing manager instance."""
    if cls._active_instance is None:
        raise RuntimeError("No active HPUBucketingManager instance. Call initialize() or activate() first.")
    return cls._active_instance

get_max_prompt_shape ¶

get_max_prompt_shape()

Source code in vllm_gaudi/extension/bucketing/common.py

def get_max_prompt_shape(self):
    return max(b[1] for b in self.prompt_buckets) \
           if len(self.prompt_buckets) > 0 else self.max_model_len

initialize ¶

initialize(
    max_num_seqs,
    max_num_prefill_seqs,
    block_size,
    max_num_batched_tokens,
    max_model_len,
    num_speculative_tokens=0,
    mamba_chunk_size=0,
    mamba_chunk_size_is_explicit=False,
)

Source code in vllm_gaudi/extension/bucketing/common.py

def initialize(self,
               max_num_seqs,
               max_num_prefill_seqs,
               block_size,
               max_num_batched_tokens,
               max_model_len,
               num_speculative_tokens=0,
               mamba_chunk_size=0,
               mamba_chunk_size_is_explicit=False):
    self.activate()
    self.max_num_seqs = max_num_seqs
    self.max_num_prefill_seqs = max_num_prefill_seqs
    self.block_size = block_size
    self.max_num_batched_tokens = max_num_batched_tokens
    self.num_hpu_blocks = None
    self._fallback_max_ctx = 0
    self.max_model_len = max_model_len
    self.num_speculative_tokens = num_speculative_tokens
    self.mamba_chunk_size = mamba_chunk_size
    self.mamba_chunk_size_is_explicit = mamba_chunk_size_is_explicit
    self.initialized = True
    self.fallback_bs_base_step = 2
    self.fallback_seq_base_step = max(32, mamba_chunk_size)
    self.fallback_blocks_base_step = 32

    if mamba_chunk_size > 0 and self.max_num_batched_tokens % mamba_chunk_size != 0:
        raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) must be "
                         f"divisible by mamba_chunk_size ({mamba_chunk_size})")

    self.use_sliding_window = get_config().PT_HPU_SDPA_QKV_SLICE_MODE_FWD
    if self.use_sliding_window:
        self.slice_size = get_config().PT_HPU_SDPA_BC_FACTOR if \
            get_config().PT_HPU_SDPA_BC_FACTOR is not None else 1024
        self.slice_thld = get_config().VLLM_FUSEDSDPA_SLIDE_THLD if \
            get_config().VLLM_FUSEDSDPA_SLIDE_THLD is not None else 8192

        msg = (
            f"use_sliding_window {self.use_sliding_window}, slice_size {self.slice_size}, threshold {self.slice_thld}"
        )
        logger().info(msg)

log_generate_info ¶

log_generate_info(is_prompt=False)

Source code in vllm_gaudi/extension/bucketing/common.py

def log_generate_info(self, is_prompt=False):
    phase = 'prompt' if is_prompt else 'decode'
    buckets = self.prompt_buckets if is_prompt else self.decode_buckets
    msg = (f"Generated {len(buckets)} "
           f"{phase} buckets [bs, query, num_blocks]: "
           f"{list(buckets)}")
    logger().info(msg)

read_from_file ¶

read_from_file(is_prompt)

Source code in vllm_gaudi/extension/bucketing/common.py

def read_from_file(self, is_prompt):
    file_name = get_config().VLLM_BUCKETING_FROM_FILE
    strategy = FileBucketingStrategy()
    return strategy.get_buckets(file_name, is_prompt)

calc_fallback_value ¶

calc_fallback_value(n: int, base_step: int)

Calculate next bucket for yet unbucketized value

Source code in vllm_gaudi/extension/bucketing/common.py

def calc_fallback_value(n: int, base_step: int):
    """ Calculate next bucket for yet unbucketized value"""
    if n <= 1:
        return n
    power = 1 / 3
    # The basic idea is that we first estimate bucket size based
    # on exponent of the number, so higher numbers will generate
    # bigger gaps between individual buckets, but it's not as steep
    # as exponential bucketing. Additionally this has a nice
    # property that generated values are guaranteed to be divisible
    # by base_step
    #
    # examples:
    # n=31, base_step=32
    #   => bucket_size = ceil(31^1/3) * 32 = 4 * 32 = 128
    #   => next_value = round_up(31, 128) = 128
    # n=4001, base_step=32
    #   => bucket_size = ceil(4001^1/3) * 32 = 16 * 32 = 512
    #   => next_value = round_up(4001, 512) = 4096
    bucket_size = math.ceil(math.pow(n, power)) * base_step
    return math.ceil(n / bucket_size) * bucket_size

find_equal_or_closest_greater_config ¶

find_equal_or_closest_greater_config(
    sorted_list, target_tuple
)

Source code in vllm_gaudi/extension/bucketing/common.py

def find_equal_or_closest_greater_config(sorted_list, target_tuple):
    idx = bisect.bisect_left(sorted_list, target_tuple)
    for i in range(idx, len(sorted_list)):
        if is_greater_or_equal(sorted_list[i], target_tuple):
            return sorted_list[i]
    return None

generate_buckets ¶

generate_buckets(
    bs_range,
    query_range,
    ctx_range,
    is_prompt,
    max_model_len,
    max_num_seqs,
    max_num_prefill_seqs,
    max_num_batched_tokens,
    block_size,
    max_blocks,
    file_buckets=None,
    mamba_chunk_size=0,
    mamba_chunk_size_is_explicit=False,
)

Source code in vllm_gaudi/extension/bucketing/common.py

def generate_buckets(bs_range,
                     query_range,
                     ctx_range,
                     is_prompt,
                     max_model_len,
                     max_num_seqs,
                     max_num_prefill_seqs,
                     max_num_batched_tokens,
                     block_size,
                     max_blocks,
                     file_buckets=None,
                     mamba_chunk_size=0,
                     mamba_chunk_size_is_explicit=False):
    use_merged_prefill = get_config().merged_prefill
    use_contiguous_pa = get_config().use_contiguous_pa

    if is_prompt and mamba_chunk_size > 0 and mamba_chunk_size_is_explicit:
        query_range = [math.ceil(query / mamba_chunk_size) * mamba_chunk_size for query in query_range]

    def expand_to_neighbor_buckets(bs_idx, bs_range, ctx_idx, ctx_range, max_num_batched_tokens):
        '''
        Expand 2d bucket (bs, query) to include:
        - itself
        - next bs value (if any)
        - next query value (if any)
        - next bs and query values together (if both exists)
        This cover case when our configuration is in budget but between
        values that are in and out of budget:
        bs < edge_case_bs < next bs and query < edge_case_query < next query
        '''

        candidates = [(bs_idx, ctx_idx), (bs_idx + 1, ctx_idx), (bs_idx, ctx_idx + 1), (bs_idx + 1, ctx_idx + 1)]
        valid_candidates = [(b_idx, q_idx) for b_idx, q_idx in candidates
                            if b_idx < len(bs_range) and q_idx < len(ctx_range)]
        return {(bs_range[b_idx], ctx_range[q_idx]) for b_idx, q_idx in valid_candidates}

    # filter rules for buckets
    # prompt
    def not_over_max_model_len(bs, query, ctx):
        # For chunked prefill, the final query chunk can be shorter than the
        # bucket query size due to padding. Keep buckets as long as context
        # blocks can represent at least one valid context length with one
        # query token available.
        # The -1 reserves at least 1 token for query so that when
        # max_model_len is exactly divisible by block_size, the ceiling
        # ctx is not over-counted.
        # Also reject buckets whose query alone exceeds max_model_len,
        # since such a prompt chunk is never reachable at runtime.
        max_ctx_blocks = math.ceil((max_model_len - 1) / block_size)
        smaller_than_limit = ctx <= max_ctx_blocks and query <= max_model_len
        if not smaller_than_limit:
            omitted_buckets.add(("condition: ctx <= ceil((max_model_len - 1) / block_size) and query <= max_model_len",
                                 "-> bs, query, ctx: ", bs, query, ctx))
        return smaller_than_limit

    def not_over_max_num_batched_tokens(bs, query, ctx):
        smaller_than_limit = bs * query <= max_num_batched_tokens
        if not smaller_than_limit:
            omitted_buckets.add(
                ("condition: bs * query <= max_num_batched_tokens", "-> bs, query, ctx: ", bs, query, ctx))
        return smaller_than_limit

    def ctx_not_over_max_ctx_for_merged_prefill(bs, query, ctx):
        smaller_than_limit = ctx <= max_num_prefill_seqs * math.ceil(
            (max_model_len - math.floor(query / max_num_prefill_seqs)) // block_size)
        if not smaller_than_limit:
            omitted_buckets.add((
                "ctx <= max_num_prefill_seqs * math.ceil((max_model_len - math.floor(query / max_num_prefill_seqs)) // block_size)",
                "-> bs, query, ctx: ", bs, query, ctx))
        return smaller_than_limit

    def no_corrections(bs, query, ctx):
        return (bs, query, ctx)

    def correct_for_max_model_len(bs, query, ctx):
        return (bs, query, min(ctx, bs * math.ceil(max_model_len / block_size)))

    def batch_size_smaller_than_blocks(bs, query, ctx):
        if not bs <= ctx:
            omitted_buckets.add(("condition: bs <= ctx, ", "-> bs, query, ctx: ", bs, query, ctx))
        return bs <= ctx

    filters_map = {
        "prompt": {
            # depends only on merged_prefill
            True: [ctx_not_over_max_ctx_for_merged_prefill],
            False: [not_over_max_model_len, not_over_max_num_batched_tokens],
        },
        "decode": {
            # depends only on contiguous PA
            True: [],
            False: [batch_size_smaller_than_blocks],
        }
    }

    def get_filters(is_prompt, use_merged_prefill, use_contiguous_pa):
        phase = "prompt" if is_prompt else "decode"
        if is_prompt:
            return filters_map[phase][use_merged_prefill]
        return filters_map[phase][use_contiguous_pa]

    def get_corrector(is_prompt, use_contiguous_pa):
        if is_prompt or use_contiguous_pa:
            return no_corrections
        else:
            return correct_for_max_model_len

    def get_max_bucket_per_query(bs, query):
        return (bs, query, math.ceil((max_model_len - query) // block_size))

    def is_ctx_allowed(ctx):
        ctx_bucket_max = get_config().VLLM_PROMPT_CTX_BUCKET_MAX
        return ctx >= 0 and (ctx_bucket_max is None or ctx < ctx_bucket_max)

    buckets = set()
    buckets_2d = set()
    omitted_buckets = set()
    filters = get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
    corrector = get_corrector(is_prompt, use_contiguous_pa)

    if file_buckets:
        for bs, query, blocks in file_buckets:
            if all(bucket_filter(bs, query, blocks) for bucket_filter in filters):
                buckets.add(corrector(bs, query, blocks))
    else:
        for bs_idx, bs in enumerate(bs_range):
            for ctx_idx, ctx in enumerate(ctx_range):
                local_buckets = expand_to_neighbor_buckets(bs_idx, bs_range, ctx_idx, ctx_range,
                                                           max_num_batched_tokens) if not is_prompt else {(bs, ctx)}
                buckets_2d.update(local_buckets)
        max_ctx = max(ctx for _, ctx in buckets_2d)
        for bs, ctx in buckets_2d:
            is_max_ctx = ctx == max_ctx
            for query in query_range:
                if is_prompt and is_max_ctx and max_model_len >= LONG_CTX_THRESHOLD:  # only for long ctx
                    bs, query, edge_ctx = get_max_bucket_per_query(bs, query)
                    if is_ctx_allowed(edge_ctx):
                        ctx = edge_ctx
                if all(bucket_filter(bs, query, ctx) for bucket_filter in filters):
                    buckets.add(corrector(bs, query, ctx))
    if file_buckets and omitted_buckets:
        phase = 'prompt' if is_prompt else 'decode'
        logger().warning(
            "The following %s buckets from custom bucketing file "
            "(VLLM_BUCKETING_FROM_FILE) were omitted during warmup:", phase)
        for bucket in sorted(omitted_buckets):
            logger().warning("  %s", bucket)

    if not buckets:
        phase = 'prompt' if is_prompt else 'decode'
        for bucket in omitted_buckets:
            logger().error(bucket)
        raise RuntimeError("Generated 0 " + phase +
                           " buckets. Please adjust the bucketing configuration according to README")

    if logger().getEffectiveLevel() <= logging.DEBUG and omitted_buckets:
        phase = "prompt" if is_prompt else "decode"
        omitted_buckets_str = "\n".join(map(str, sorted(omitted_buckets)))
        msg = f"Omitted {len(omitted_buckets)} {phase} buckets:\n{omitted_buckets_str}"
        logger().debug(msg)

    return sorted(buckets)

get_bucketing_manager ¶

get_bucketing_manager()

Source code in vllm_gaudi/extension/bucketing/common.py

def get_bucketing_manager():
    instance = HPUBucketingManager.get_instance()
    return instance

is_greater_or_equal ¶

is_greater_or_equal(tuple1, tuple2)

Source code in vllm_gaudi/extension/bucketing/common.py

def is_greater_or_equal(tuple1, tuple2):
    return tuple1[0] >= tuple2[0] and tuple1[1] >= tuple2[1] \
           and tuple1[2] >= tuple2[2]

vllm_gaudi.extension.bucketing.common ¶

LONG_CTX_THRESHOLD module-attribute ¶

HPUBucketingManager ¶

_active_instance class-attribute instance-attribute ¶

decode_buckets instance-attribute ¶

initialized instance-attribute ¶

prompt_buckets instance-attribute ¶

seed_decode_buckets instance-attribute ¶

__init__ ¶

activate ¶

deactivate classmethod ¶

find_decode_bucket ¶

find_prompt_bucket ¶

generate_decode_buckets ¶

generate_fallback_bucket ¶

generate_prompt_buckets ¶

generate_spec_decode_buckets ¶

get_bucketing_strategy ¶

get_instance classmethod ¶

get_max_prompt_shape ¶

initialize ¶

log_generate_info ¶

read_from_file ¶

calc_fallback_value ¶

find_equal_or_closest_greater_config ¶

generate_buckets ¶

get_bucketing_manager ¶

is_greater_or_equal ¶

LONG_CTX_THRESHOLD `module-attribute` ¶

_active_instance `class-attribute` `instance-attribute` ¶

decode_buckets `instance-attribute` ¶

initialized `instance-attribute` ¶

prompt_buckets `instance-attribute` ¶

seed_decode_buckets `instance-attribute` ¶

init ¶

deactivate `classmethod` ¶

get_instance `classmethod` ¶