Skip to content

vllm.model_executor.layers.fused_moe.experts.fused_humming_moe

Fused MoE utilities for Humming.

Classes:

BatchedHummingGroupedExperts

Bases: HummingExpertsBase

Methods:

  • apply

    Standard apply implementation for Humming batched grouped experts.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
class BatchedHummingGroupedExperts(HummingExpertsBase):
    """Humming experts for the ``BatchedExperts`` activation format.

    Runs both expert GEMMs with the ``GROUPED_MASKED`` Humming GEMM type and
    uses the per-expert token counts from ``expert_tokens_meta`` as the
    expert layout.
    """

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """Return the weight-and-reduce strategy (``TopKWeightAndReduceDelegate``)."""
        reducer = TopKWeightAndReduceDelegate()
        return reducer

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Report that these experts consume the ``BatchedExperts`` format."""
        formats = mk.FusedMoEActivationFormat
        return formats.BatchedExperts

    @staticmethod
    def humming_gemm_type() -> "HummingGemmType":
        """Return the Humming GEMM type used by this class (``GROUPED_MASKED``)."""
        # Function-scope import, matching the other Humming imports in this file.
        from vllm.utils.humming import GemmType as HummingGemmType

        gemm_type = HummingGemmType.GROUPED_MASKED
        return gemm_type

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Run gate/up GEMM -> activation -> down GEMM for batched experts.

        Note: Humming kernels handle weights and quantization internally through
        the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used.
        ``output``, ``topk_weights``, ``global_num_experts`` and ``expert_map``
        are likewise not referenced here; the result is written into the
        "down_output" buffer that ``prepare_buffers`` carves out of the
        workspaces.

        ``apply_router_weight_on_input`` must be False and
        ``expert_tokens_meta`` must not be None (both are asserted).
        """
        from vllm.utils.humming import HummingMethod

        assert not apply_router_weight_on_input
        assert expert_tokens_meta is not None

        # Flatten every leading dimension so the GEMM sees a 2D (rows, hidden) input.
        hidden_dim = hidden_states.size(-1)
        hidden_states = hidden_states.view(-1, hidden_dim)
        valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)
        expert_num_tokens = expert_tokens_meta.expert_num_tokens

        num_tokens, top_k = topk_ids.size(0), topk_ids.size(1)
        buffers = self.prepare_buffers(
            workspace13, workspace2, num_tokens, top_k, activation
        )

        # Keyword arguments that are identical for the w13 and w2 GEMM calls.
        shared_gemm_kwargs = {
            "layer": self.layer,
            "valid_shape_m": valid_shape_m,
            "expert_layout": expert_num_tokens,
            "compute_config": self.compute_config_str,
        }

        # Stage 1: gate/up projection ("w13").
        gate_up_in, gate_up_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=hidden_states,
            quanted_input=buffers.get("quanted_gate_up_input"),
            sublayer_name="w13",
        )
        HummingMethod.forward_layer(
            inputs=gate_up_in,
            input_scale=gate_up_scale,
            outputs=buffers["gate_up_output"],
            tuning_config=self.w13_tuning_config_str,
            sublayer_name="w13",
            **shared_gemm_kwargs,
        )

        # Stage 2: activation between the two GEMMs.
        self.apply_activation(
            activation=activation,
            input=buffers["gate_up_output"],
            output=buffers["activation_output"],
        )

        # Stage 3: down projection ("w2"), written as 2D (rows, hidden) output.
        down_in, down_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=buffers["activation_output"],
            quanted_input=buffers.get("quanted_down_input"),
            sublayer_name="w2",
        )
        down_out_2d = buffers["down_output"].view(-1, hidden_dim)
        HummingMethod.forward_layer(
            inputs=down_in,
            input_scale=down_scale,
            outputs=down_out_2d,
            tuning_config=self.w2_tuning_config_str,
            sublayer_name="w2",
            **shared_gemm_kwargs,
        )

apply(output, hidden_states, w1, w2, topk_weights, topk_ids, activation, global_num_experts, expert_map, a1q_scale, a2_scale, workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input)

Standard apply implementation for Humming batched grouped experts.

Note: Humming kernels handle weights and quantization internally through the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used. The output is written into workspace13 via the buffer management.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
) -> None:
    """
    Run gate/up GEMM -> activation -> down GEMM for batched experts.

    Note: Humming kernels handle weights and quantization internally through
    the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used.
    ``output``, ``topk_weights``, ``global_num_experts`` and ``expert_map``
    are likewise not referenced here; the result is written into the
    "down_output" buffer that ``prepare_buffers`` carves out of the workspaces.

    ``apply_router_weight_on_input`` must be False and ``expert_tokens_meta``
    must not be None (both are asserted).
    """
    from vllm.utils.humming import HummingMethod

    assert not apply_router_weight_on_input
    assert expert_tokens_meta is not None

    # Flatten every leading dimension so the GEMM sees a 2D (rows, hidden) input.
    hidden_dim = hidden_states.size(-1)
    hidden_states = hidden_states.view(-1, hidden_dim)
    valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)
    expert_num_tokens = expert_tokens_meta.expert_num_tokens

    num_tokens, top_k = topk_ids.size(0), topk_ids.size(1)
    buffers = self.prepare_buffers(
        workspace13, workspace2, num_tokens, top_k, activation
    )

    # Keyword arguments that are identical for the w13 and w2 GEMM calls.
    shared_gemm_kwargs = {
        "layer": self.layer,
        "valid_shape_m": valid_shape_m,
        "expert_layout": expert_num_tokens,
        "compute_config": self.compute_config_str,
    }

    # Stage 1: gate/up projection ("w13").
    gate_up_in, gate_up_scale = HummingMethod.may_quant_input(
        layer=self.layer,
        inputs=hidden_states,
        quanted_input=buffers.get("quanted_gate_up_input"),
        sublayer_name="w13",
    )
    HummingMethod.forward_layer(
        inputs=gate_up_in,
        input_scale=gate_up_scale,
        outputs=buffers["gate_up_output"],
        tuning_config=self.w13_tuning_config_str,
        sublayer_name="w13",
        **shared_gemm_kwargs,
    )

    # Stage 2: activation between the two GEMMs.
    self.apply_activation(
        activation=activation,
        input=buffers["gate_up_output"],
        output=buffers["activation_output"],
    )

    # Stage 3: down projection ("w2"), written as 2D (rows, hidden) output.
    down_in, down_scale = HummingMethod.may_quant_input(
        layer=self.layer,
        inputs=buffers["activation_output"],
        quanted_input=buffers.get("quanted_down_input"),
        sublayer_name="w2",
    )
    down_out_2d = buffers["down_output"].view(-1, hidden_dim)
    HummingMethod.forward_layer(
        inputs=down_in,
        input_scale=down_scale,
        outputs=down_out_2d,
        tuning_config=self.w2_tuning_config_str,
        sublayer_name="w2",
        **shared_gemm_kwargs,
    )

HummingExpertsBase

Bases: FusedMoEExpertsModular

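Attributes:

  • expects_unquantized_inputs (bool)

    Always True: Humming experts receive unquantized inputs and quantize them internally in apply().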

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
class HummingExpertsBase(mk.FusedMoEExpertsModular):
    """Shared base class for the Humming fused-MoE expert implementations.

    Subclasses provide ``humming_gemm_type()``, ``activation_format()`` and
    ``apply()``. This class holds the layer handle, builds the JSON-encoded
    compute/tuning configs passed to the Humming kernels, and sizes and carves
    the intermediate buffers out of two shared workspaces.
    """

    def __init__(
        self,
        layer: "RoutedExperts",
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
        max_num_tokens: int | None = None,
        num_dispatchers: int | None = None,
    ):
        """Bind the experts to ``layer`` and precompute the Humming configs.

        ``max_num_tokens`` and ``num_dispatchers`` are required (asserted)
        when the subclass uses the batched activation format.
        """
        # These attributes are set before super().__init__() because
        # init_humming_moe() reads self.layer.
        self.layer = layer
        self.num_experts = self.layer.num_experts
        self.global_num_experts = self.layer.global_num_experts
        self.init_humming_moe()

        if self.is_batched():
            assert max_num_tokens is not None and num_dispatchers is not None

        super().__init__(
            moe_config=moe_config,
            quant_config=quant_config,
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
        )
        # Created lazily by _get_permute_scratch().
        self._permute_scratch: MoEPermuteScratch | None = None

    def init_humming_moe(self):
        """Build the compute and tuning configs and cache their JSON strings.

        Sets ``compute_config``, ``w13_tuning_config``, ``w2_tuning_config``
        and the matching ``*_str`` attributes (``json.dumps`` of each).
        """
        from vllm.utils.humming import HummingMethod

        gemm_type = self.humming_gemm_type()
        use_batch_invariant = envs.VLLM_BATCH_INVARIANT
        use_f16_accum = envs.VLLM_HUMMING_USE_F16_ACCUM

        self.compute_config = {
            "use_batch_invariant": use_batch_invariant,
            "use_f16_accum": use_f16_accum,
            "gemm_type": gemm_type.value,
        }
        self.w13_tuning_config = HummingMethod.get_default_tuning_configs(
            layer=self.layer,
            use_f16_accum=use_f16_accum,
            use_batch_invariant=use_batch_invariant,
            gemm_type=gemm_type,
            sublayer_name="w13",
        )
        self.w2_tuning_config = HummingMethod.get_default_tuning_configs(
            layer=self.layer,
            use_f16_accum=use_f16_accum,
            use_batch_invariant=use_batch_invariant,
            gemm_type=gemm_type,
            sublayer_name="w2",
        )
        # The kernels are handed the configs as JSON strings; serialize once.
        self.compute_config_str = json.dumps(self.compute_config)
        self.w13_tuning_config_str = json.dumps(self.w13_tuning_config)
        self.w2_tuning_config_str = json.dumps(self.w2_tuning_config)

    def _get_permute_scratch(self) -> MoEPermuteScratch | None:
        """Return the cached permute scratch, creating it on first use.

        Returns None when ``moe_permute_unpermute_supported()`` is false and
        no scratch has been created.
        """
        if self._permute_scratch is None and moe_permute_unpermute_supported():
            self._permute_scratch = MoEPermuteScratch(
                max_num_tokens=self.moe_config.max_num_tokens,
                topk=self.moe_config.experts_per_token,
                num_experts=self.moe_config.num_experts,
                num_local_experts=self.moe_config.num_local_experts,
                device=torch.device(self.moe_config.device),
                hidden_size=self.moe_config.hidden_dim,
                hidden_dtype=self.moe_config.in_dtype,
            )
        return self._permute_scratch

    def get_global_valid_shape_m(self, topk_ids: torch.Tensor):
        """Return ``num_tokens * topk`` for the current forward pass.

        ``num_tokens`` is ``topk_ids.size(0)`` unless the forward context has
        ``dp_metadata``, in which case the sum of
        ``dp_metadata.num_tokens_across_dp_cpu`` is used instead.
        """
        num_tokens = topk_ids.size(0)
        ctx = get_forward_context()
        if ctx.dp_metadata is not None:
            num_tokens = ctx.dp_metadata.num_tokens_across_dp_cpu.sum().item()

        return num_tokens * topk_ids.size(1)

    def estimate_local_valid_shape_m(self, topk_ids: torch.Tensor):
        """Estimate the local row count used for kernel tuning.

        Scales the global row count by ``num_experts / global_num_experts``
        and rounds up.
        """
        # estimate shape_m for kernel tuning
        global_valid_shape_m = self.get_global_valid_shape_m(topk_ids)
        num_experts = self.num_experts
        global_num_experts = self.global_num_experts
        return math.ceil(global_valid_shape_m * num_experts / global_num_experts)

    @staticmethod
    def humming_gemm_type() -> "HummingGemmType":
        """Return the Humming GEMM type; must be overridden by subclasses."""
        raise NotImplementedError

    @classmethod
    def is_batched(cls) -> bool:
        """Return True when the class uses the ``BatchedExperts`` format."""
        return cls.activation_format() == mk.FusedMoEActivationFormat.BatchedExperts

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        """Return True if the (weight, activation) quant pair is supported."""
        SUPPORTED_W_A = [
            (kMxfp4Static, None),
            (kMxfp4Static, kMxfp4Dynamic),
            (kMxfp4Static, kMxfp8Dynamic),
            (kMxfp4Static, kFp8DynamicTokenSym),
            (kNvfp4Static, None),
            (kNvfp4Static, kFp8DynamicTokenSym),
            (kMxfp8Static, None),
            (kMxfp8Static, kFp8DynamicTokenSym),
            (kFp8StaticChannelSym, None),
            (kFp8StaticChannelSym, kFp8DynamicTokenSym),
            (kFp8Static128BlockSym, None),
            (kFp8Static128BlockSym, kFp8DynamicTokenSym),
            (kInt4Static, None),
            (kInt4Static, kFp8DynamicTokenSym),
            (kInt8Static, None),
            (kInt8Static, kFp8DynamicTokenSym),
        ]
        return (weight_key, activation_key) in SUPPORTED_W_A

    @property
    def expects_unquantized_inputs(self) -> bool:
        """
        Humming kernels handle input quantization internally via
        HummingMethod.may_quant_input() in the apply() method.

        This property tells the prepare/finalize step to skip input
        quantization (by setting defer_input_quant=True) and pass
        unquantized inputs to the experts. This prevents double
        quantization: once in prepare and once in Humming's apply().

        Returns:
            True to indicate that this expert expects unquantized inputs
            and will handle quantization internally.
        """
        return True

    @staticmethod
    def _supports_current_device() -> bool:
        """Return True on CUDA with capability >= (7, 5) when Humming is available."""
        platform = current_platform
        return (
            has_humming()
            and platform.is_cuda()
            and platform.has_device_capability((7, 5))
        )

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        """Report that non-gated ("no mul") activations are supported."""
        return True

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        """Return True if ``activation`` is one of the supported activations."""
        # Humming uses apply_moe_activation() callback for activation,
        # so any activation supported there can be used here.
        return activation in [
            MoEActivation.SILU,
            MoEActivation.GELU,
            MoEActivation.GELU_TANH,
            MoEActivation.SWIGLUOAI,
            MoEActivation.SWIGLUSTEP,
            MoEActivation.SILU_NO_MUL,
            MoEActivation.GELU_NO_MUL,
            MoEActivation.GELU_TANH_NO_MUL,
            MoEActivation.RELU2_NO_MUL,
        ]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        """Accept every parallel configuration."""
        return True

    def moe_problem_size(
        self,
        a1: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> tuple[int, int, int, int, int]:
        """Derive the problem size from the layer's Humming metadata.

        Returns ``(num_experts, num_tokens, shape_n // 2, shape_k, top_k)``,
        where ``shape_n`` and ``shape_k`` come from the "w13" meta. The token
        count is ``a1.size(0)`` in the standard format and ``a1.size(1)`` in
        the batched format. Shape consistency is checked with asserts.
        """
        from vllm.utils.humming import HummingLayerMeta

        meta1: HummingLayerMeta = self.layer.humming_metas["w13"]
        meta2: HummingLayerMeta = self.layer.humming_metas["w2"]

        assert meta1.num_experts == meta2.num_experts

        num_experts = meta1.num_experts
        top_k = topk_ids.size(1)
        assert w1.size(0) == num_experts
        assert w2.size(0) == num_experts

        if not self.is_batched():
            num_tokens = a1.size(0)
            assert topk_ids.size(0) == num_tokens
        else:
            assert a1.dim() == 3
            assert a1.size(0) == num_experts
            num_tokens = a1.size(1)

        return meta1.num_experts, num_tokens, meta1.shape_n // 2, meta1.shape_k, top_k

    def get_buffer_metas(self, M: int, topk: int, activation: MoEActivation):
        """Describe every intermediate buffer and list the ones actually needed.

        Returns ``(buffer_metas, required_buffers)``. ``buffer_metas`` maps a
        buffer name to ``{"shape": tuple, "dtype": torch.dtype}``;
        ``required_buffers`` lists the names in pipeline order.

        Raises:
            ValueError: if a quantized buffer must be packed for a 4-bit
                activation dtype but its last dimension is odd.
        """
        from vllm.utils.humming import GemmType as HummingGemmType
        from vllm.utils.humming import dtypes

        num_experts = self.num_experts
        N = self.layer.intermediate_size_per_partition
        K = self.layer.hidden_size
        assert isinstance(num_experts, int)
        assert isinstance(N, int)
        assert isinstance(K, int)

        # hidden_states
        # (-> quanted_gate_up_input) (if not BF16/FP16 activation)
        # -> gate_up_output
        # -> activation_output
        # (-> quanted_down_input) (if not BF16/FP16 activation)
        # -> down_output
        # (-> output) (if not is_batched)
        # Neighboring nodes are required to utilize distinct workspaces.
        # The output must be derived from workspace1.

        output_shape: tuple[int, ...]
        if self.is_batched():
            max_num_tokens = self.max_num_tokens
            num_dispatchers = self.num_dispatchers
            assert max_num_tokens is not None and num_dispatchers is not None
            input_shape_m = num_experts * max_num_tokens
            real_shape_m = num_experts * max_num_tokens * num_dispatchers
            output_shape = (num_experts, max_num_tokens * num_dispatchers, K)
        else:
            input_shape_m = M
            if self.humming_gemm_type() != HummingGemmType.INDEXED:
                input_shape_m = M * topk
            real_shape_m = M * topk
            output_shape = (M, K)

        # Gated activations halve the gate/up width; non-gated ones keep all 2N.
        down_input_size = N if activation.is_gated else (N * 2)
        a_dtype = self.layer.humming_metas["w13"].a_dtype
        c_dtype = self.layer.humming_metas["w13"].c_dtype
        num_bits = a_dtype.num_bits
        torch_dtype_map = {
            dtypes.float16: torch.float16,
            dtypes.bfloat16: torch.bfloat16,
            dtypes.float32: torch.float32,
            dtypes.float8e4m3: torch.float8_e4m3fn,
            dtypes.float8e5m2: torch.float8_e5m2,
            dtypes.int8: torch.int8,
            dtypes.int4: torch.uint8,
        }

        buffer_metas = {
            "quanted_gate_up_input": {
                "shape": (input_shape_m, K),
                "dtype": torch_dtype_map[a_dtype],
            },
            "gate_up_output": {
                "shape": (real_shape_m, N * 2),
                "dtype": torch_dtype_map[c_dtype],
            },
            "activation_output": {
                "shape": (real_shape_m, down_input_size),
                "dtype": torch_dtype_map[c_dtype],
            },
            "quanted_down_input": {
                "shape": (real_shape_m, down_input_size),
                "dtype": torch_dtype_map[a_dtype],
            },
            "down_output": {
                "shape": output_shape if self.is_batched() else (real_shape_m, K),
                "dtype": torch_dtype_map[c_dtype],
            },
            "output": {
                "shape": output_shape,
                "dtype": torch_dtype_map[c_dtype],
            },
        }

        # 4-bit activations are stored two per element, so the last dimension
        # of every quantized buffer is halved.
        if num_bits == 4:
            for key, meta in buffer_metas.items():
                if "quanted" not in key:
                    continue
                last_dim = meta["shape"][-1]
                if last_dim % 2 != 0:
                    raise ValueError(
                        f"Int4 packing requires last dimension to be even, "
                        f"got {last_dim} for buffer '{key}'"
                    )
                meta["shape"] = meta["shape"][:-1] + (last_dim // 2,)

        # 16-bit activations need no separate quantized-input buffers.
        if num_bits == 16:
            required_buffers = ["gate_up_output", "activation_output", "down_output"]
        else:
            required_buffers = [
                "quanted_gate_up_input",
                "gate_up_output",
                "activation_output",
                "quanted_down_input",
                "down_output",
            ]

        # batched moe use down_output as output
        if not self.is_batched():
            required_buffers.append("output")

        return buffer_metas, required_buffers

    def _workspace_shapes(self, M: int, topk: int, activation: MoEActivation):
        """Compute the two workspace sizes and the output shape.

        Buffers are assigned back to front, alternating between workspace1
        and workspace2 (the last buffer goes to workspace1), and each
        workspace is sized for its largest buffer. Sizes are returned as 1-D
        shapes counted in elements of ``self.layer.params_dtype``.
        """
        buffer_metas, required_buffers = self.get_buffer_metas(M, topk, activation)

        # Index 0 tracks workspace1, index 1 tracks workspace2. The parity
        # must match prepare_buffers().
        workspace_nbytes = [0, 0]
        for index, name in enumerate(required_buffers[::-1]):
            buffer_meta = buffer_metas[name]
            nelement = math.prod(buffer_meta["shape"])
            nbytes = nelement * buffer_meta["dtype"].itemsize
            slot = index % 2
            workspace_nbytes[slot] = max(workspace_nbytes[slot], nbytes)

        output_key = "down_output" if self.is_batched() else "output"
        output_shape = buffer_metas[output_key]["shape"]
        elem_size = self.layer.params_dtype.itemsize

        # Round up when converting bytes to elements: flooring would leave the
        # workspace short whenever a byte count is not a multiple of elem_size.
        workspace1_nelem = (workspace_nbytes[0] + elem_size - 1) // elem_size
        workspace2_nelem = (workspace_nbytes[1] + elem_size - 1) // elem_size

        return (
            (workspace1_nelem,),
            (workspace2_nelem,),
            output_shape,
        )

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return ``(workspace1_shape, workspace2_shape, output_shape)``.

        Only ``M``, ``topk`` and ``activation`` are used; the remaining
        arguments are accepted for signature compatibility and ignored.
        """
        return self._workspace_shapes(M, topk, activation)

    def make_workspaces(self, M: int, topk: int, activation: MoEActivation):
        """Obtain both workspaces and an output view over workspace1.

        Returns ``(workspace1, workspace2, output)``; the workspaces come
        from the current workspace manager with dtype
        ``self.layer.params_dtype``.
        """
        shapes = self._workspace_shapes(M, topk, activation)
        workspace1_shape, workspace2_shape, output_shape = shapes
        torch_dtype = self.layer.params_dtype
        workspace1, workspace2 = current_workspace_manager().get_simultaneous(
            (workspace1_shape, torch_dtype),
            (workspace2_shape, torch_dtype),
        )
        output = _resize_cache(workspace1, output_shape)
        return workspace1, workspace2, output

    def prepare_buffers(
        self,
        workspace1: torch.Tensor,
        workspace2: torch.Tensor,
        M: int,
        topk: int,
        activation: MoEActivation,
    ) -> dict[str, torch.Tensor]:
        """Carve the required buffers out of the two workspaces.

        Uses the same back-to-front alternation as ``_workspace_shapes``, so
        adjacent pipeline stages never share a workspace. Each workspace is
        reinterpreted as the buffer's dtype and resized to its shape.
        """
        buffer_metas, required_buffers = self.get_buffer_metas(M, topk, activation)
        buffers = {}
        for index, name in enumerate(required_buffers[::-1]):
            buffer_meta = buffer_metas[name]
            workspace = workspace1 if index % 2 == 0 else workspace2
            workspace = workspace.view(buffer_meta["dtype"])
            buffers[name] = _resize_cache(workspace, buffer_meta["shape"])

        return buffers

    # Note: apply method is implemented by subclasses following the
    # standard FusedMoEExpertsModular.apply signature

    @staticmethod
    def is_supported_config(
        cls: type[mk.FusedMoEExperts],
        moe_config: FusedMoEConfig,
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
        activation_format: mk.FusedMoEActivationFormat,
    ) -> tuple[bool, str | None]:
        """Check the base support rules plus the preferred Humming GEMM type.

        After the ``FusedMoEExpertsModular`` check passes, the config is
        rejected when ``get_humming_moe_gemm_type()`` returns a preference
        that differs (case-insensitively) from ``cls.humming_gemm_type()``.

        Returns:
            ``(supported, reason)``; ``reason`` explains a rejection.
        """
        supported, reason = mk.FusedMoEExpertsModular.is_supported_config(
            cls,
            moe_config,
            weight_key,
            activation_key,
            activation_format,
        )

        if supported:
            assert hasattr(cls, "humming_gemm_type")
            gemm_type = cls.humming_gemm_type().value.lower()
            preferred_gemm_type = get_humming_moe_gemm_type()
            if preferred_gemm_type is not None:
                supported = preferred_gemm_type.lower() == gemm_type
                if not supported:
                    reason = (
                        f"preferred gemm type {preferred_gemm_type} != "
                        f"supported gemm type {gemm_type}"
                    )

        return supported, reason

    def apply_activation(
        self,
        activation: MoEActivation,
        output: torch.Tensor,
        input: torch.Tensor,
    ) -> None:
        """Apply ``activation`` to ``input``, writing the result to ``output``.

        SILU with a configured ``quant_config.gemm1_clamp_limit`` goes
        through ``swiglu_limit_func``; every other case uses
        ``self.activation``.
        """
        swiglu_limit = self.quant_config.gemm1_clamp_limit
        if activation == MoEActivation.SILU and swiglu_limit is not None:
            swiglu_limit_func(output=output, input=input, swiglu_limit=swiglu_limit)
        else:
            self.activation(activation=activation, input=input, output=output)

expects_unquantized_inputs property

Humming kernels handle input quantization internally via HummingMethod.may_quant_input() in the apply() method.

This property tells the prepare/finalize step to skip input quantization (by setting defer_input_quant=True) and pass unquantized inputs to the experts. This prevents double quantization: once in prepare and once in Humming's apply().

Returns:

  • bool

    True to indicate that this expert expects unquantized inputs and will handle quantization internally.

HummingGroupedExperts

Bases: HummingExpertsBase

Methods:

  • apply

    Standard apply implementation for Humming grouped experts.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
class HummingGroupedExperts(HummingExpertsBase):
    """Humming experts for the ``Standard`` activation format.

    Permutes tokens with ``moe_permute``, runs both expert GEMMs with the
    ``GROUPED_CONTIGUOUS`` Humming GEMM type, and restores token order with
    ``moe_unpermute``.
    """

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """Return the weight-and-reduce strategy (``TopKWeightAndReduceNoOP``)."""
        reducer = TopKWeightAndReduceNoOP()
        return reducer

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Report that these experts consume the ``Standard`` format."""
        formats = mk.FusedMoEActivationFormat
        return formats.Standard

    @staticmethod
    def humming_gemm_type() -> "HummingGemmType":
        """Return the Humming GEMM type used by this class (``GROUPED_CONTIGUOUS``)."""
        # Function-scope import, matching the other Humming imports in this file.
        from vllm.utils.humming import GemmType as HummingGemmType

        gemm_type = HummingGemmType.GROUPED_CONTIGUOUS
        return gemm_type

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Run permute -> gate/up GEMM -> activation -> down GEMM -> unpermute.

        Note: Humming kernels handle weights and quantization internally through
        the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used.
        ``output`` and ``expert_tokens_meta`` are likewise not referenced here;
        the result is written into the "output" buffer that ``prepare_buffers``
        carves out of the workspaces.

        ``apply_router_weight_on_input`` must be False (asserted).
        """
        from vllm.utils.humming import HummingMethod

        assert not apply_router_weight_on_input

        valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)

        num_tokens, top_k = topk_ids.size(0), topk_ids.size(1)
        buffers = self.prepare_buffers(
            workspace13, workspace2, num_tokens, top_k, activation
        )

        # Permute the tokens; only three of the five returned values are used.
        permuted = moe_permute(
            hidden_states=hidden_states,
            a1q_scale=None,
            topk_ids=topk_ids,
            n_expert=global_num_experts,
            n_local_expert=self.num_experts,
            expert_map=expert_map,
            scratch=self._get_permute_scratch(),
        )
        hidden_states, _, expert_first_token_offset, inv_perm, _ = permuted

        # Keyword arguments that are identical for the w13 and w2 GEMM calls.
        shared_gemm_kwargs = {
            "layer": self.layer,
            "valid_shape_m": valid_shape_m,
            "expert_layout": expert_first_token_offset,
            "compute_config": self.compute_config_str,
        }

        # Stage 1: gate/up projection ("w13").
        gate_up_in, gate_up_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=hidden_states,
            quanted_input=buffers.get("quanted_gate_up_input"),
            sublayer_name="w13",
        )
        HummingMethod.forward_layer(
            inputs=gate_up_in,
            input_scale=gate_up_scale,
            outputs=buffers["gate_up_output"],
            tuning_config=self.w13_tuning_config_str,
            sublayer_name="w13",
            **shared_gemm_kwargs,
        )

        # Stage 2: activation between the two GEMMs.
        self.apply_activation(
            activation=activation,
            input=buffers["gate_up_output"],
            output=buffers["activation_output"],
        )

        # Stage 3: down projection ("w2").
        down_in, down_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=buffers["activation_output"],
            quanted_input=buffers.get("quanted_down_input"),
            sublayer_name="w2",
        )
        down_output = buffers["down_output"]
        HummingMethod.forward_layer(
            inputs=down_in,
            input_scale=down_scale,
            outputs=down_output,
            tuning_config=self.w2_tuning_config_str,
            sublayer_name="w2",
            **shared_gemm_kwargs,
        )

        # Stage 4: undo the permutation into the final output buffer.
        moe_unpermute(
            out=buffers["output"],
            permuted_hidden_states=down_output.view(*topk_ids.shape, -1),
            topk_weights=topk_weights,
            inv_permuted_idx=inv_perm,
            expert_first_token_offset=expert_first_token_offset,
        )

apply(output, hidden_states, w1, w2, topk_weights, topk_ids, activation, global_num_experts, expert_map, a1q_scale, a2_scale, workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input)

Standard apply implementation for Humming grouped experts.

Note: Humming kernels handle weights and quantization internally through the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used. The output is written into workspace13 via the buffer management.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
) -> None:
    """
    Run permute -> gate/up GEMM -> activation -> down GEMM -> unpermute.

    Note: Humming kernels handle weights and quantization internally through
    the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used.
    ``output`` and ``expert_tokens_meta`` are likewise not referenced here;
    the result is written into the "output" buffer that ``prepare_buffers``
    carves out of the workspaces.

    ``apply_router_weight_on_input`` must be False (asserted).
    """
    from vllm.utils.humming import HummingMethod

    assert not apply_router_weight_on_input

    valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)

    num_tokens, top_k = topk_ids.size(0), topk_ids.size(1)
    buffers = self.prepare_buffers(
        workspace13, workspace2, num_tokens, top_k, activation
    )

    # Permute the tokens; only three of the five returned values are used.
    permuted = moe_permute(
        hidden_states=hidden_states,
        a1q_scale=None,
        topk_ids=topk_ids,
        n_expert=global_num_experts,
        n_local_expert=self.num_experts,
        expert_map=expert_map,
        scratch=self._get_permute_scratch(),
    )
    hidden_states, _, expert_first_token_offset, inv_perm, _ = permuted

    # Keyword arguments that are identical for the w13 and w2 GEMM calls.
    shared_gemm_kwargs = {
        "layer": self.layer,
        "valid_shape_m": valid_shape_m,
        "expert_layout": expert_first_token_offset,
        "compute_config": self.compute_config_str,
    }

    # Stage 1: gate/up projection ("w13").
    gate_up_in, gate_up_scale = HummingMethod.may_quant_input(
        layer=self.layer,
        inputs=hidden_states,
        quanted_input=buffers.get("quanted_gate_up_input"),
        sublayer_name="w13",
    )
    HummingMethod.forward_layer(
        inputs=gate_up_in,
        input_scale=gate_up_scale,
        outputs=buffers["gate_up_output"],
        tuning_config=self.w13_tuning_config_str,
        sublayer_name="w13",
        **shared_gemm_kwargs,
    )

    # Stage 2: activation between the two GEMMs.
    self.apply_activation(
        activation=activation,
        input=buffers["gate_up_output"],
        output=buffers["activation_output"],
    )

    # Stage 3: down projection ("w2").
    down_in, down_scale = HummingMethod.may_quant_input(
        layer=self.layer,
        inputs=buffers["activation_output"],
        quanted_input=buffers.get("quanted_down_input"),
        sublayer_name="w2",
    )
    down_output = buffers["down_output"]
    HummingMethod.forward_layer(
        inputs=down_in,
        input_scale=down_scale,
        outputs=down_output,
        tuning_config=self.w2_tuning_config_str,
        sublayer_name="w2",
        **shared_gemm_kwargs,
    )

    # Stage 4: undo the permutation into the final output buffer.
    moe_unpermute(
        out=buffers["output"],
        permuted_hidden_states=down_output.view(*topk_ids.shape, -1),
        topk_weights=topk_weights,
        inv_permuted_idx=inv_perm,
        expert_first_token_offset=expert_first_token_offset,
    )

HummingIndexedExperts

Bases: HummingExpertsBase

Methods:

  • apply

    Standard apply implementation for Humming indexed experts.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
class HummingIndexedExperts(HummingExpertsBase):
    """Humming experts implementation built on the ``INDEXED`` GEMM type.

    Uses the ``Standard`` activation format and reports
    ``TopKWeightAndReduceNoOP`` as its finalize step; ``apply`` itself ends
    with a ``moe_fused_mul_sum`` call that consumes ``topk_weights``.
    """

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """Return a fresh ``TopKWeightAndReduceNoOP`` instance."""
        return TopKWeightAndReduceNoOP()

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        """Return the activation format used by this class (``Standard``)."""
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def humming_gemm_type() -> "HummingGemmType":
        """Return the Humming GEMM type used by this class (``INDEXED``)."""
        # Imported lazily, mirroring the other Humming imports in this file.
        from vllm.utils.humming import GemmType as HummingGemmType

        return HummingGemmType.INDEXED

    def prepare_humming_moe_kwargs(
        self,
        topk_ids: torch.Tensor,
        expert_map: torch.Tensor | None,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """
        Build the extra keyword arguments for the two ``forward_layer`` calls.

        The block size passed to ``moe_align_block_size`` is taken from the
        first entry of ``self.w13_tuning_config`` whose ``(min, max]`` range
        contains the estimated valid M; when no entry matches, a warning is
        logged once and 64 is used.

        Args:
            topk_ids: Expert ids selected per token; ``size(1)`` is used as
                the ``top_k`` of the first kwargs dict.
            expert_map: Forwarded unchanged to ``moe_align_block_size``.
            expert_tokens_meta: Accepted for interface compatibility; not
                read in this method.

        Returns:
            ``(w13_kwargs, w2_kwargs)``. Both contain the alignment results,
            ``compute_config`` and ``valid_shape_m``; they differ in
            ``top_k`` (``topk_ids.size(1)`` vs ``1``) and ``tuning_config``.
        """
        valid_shape_m = self.estimate_local_valid_shape_m(topk_ids)

        # First matching tuning range wins; None means "nothing matched".
        moe_block_size = next(
            (
                config["block_shape"][0]
                for min_shape_m, max_shape_m, config in self.w13_tuning_config
                if min_shape_m < valid_shape_m <= max_shape_m
            ),
            None,
        )

        if moe_block_size is None:
            logger.warning_once(
                "No tuning config found for shape %s, using default block_size=64",
                valid_shape_m,
            )
            moe_block_size = 64

        sorted_ids, expert_ids, num_tokens_padded = moe_align_block_size(
            topk_ids=topk_ids,
            block_size=moe_block_size,
            num_experts=self.global_num_experts,
            expert_map=expert_map,
            ignore_invalid_experts=True,
        )

        # Entries common to both GEMM calls.
        shared_kwargs = {
            "sorted_ids": sorted_ids,
            "expert_ids": expert_ids,
            "num_tokens_padded": num_tokens_padded,
            "compute_config": self.compute_config_str,
            "valid_shape_m": valid_shape_m,
        }

        gate_up_kwargs = {
            "top_k": topk_ids.size(1),
            "tuning_config": self.w13_tuning_config_str,
            **shared_kwargs,
        }
        down_kwargs = {
            "top_k": 1,
            "tuning_config": self.w2_tuning_config_str,
            **shared_kwargs,
        }

        return gate_up_kwargs, down_kwargs

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Standard apply implementation for Humming indexed experts.

        Stages, in order: ``may_quant_input`` + ``forward_layer`` for "w13",
        ``apply_activation``, ``may_quant_input`` + ``forward_layer`` for
        "w2", then ``moe_fused_mul_sum`` on the down output viewed as
        ``(*topk_ids.shape, -1)``.

        Note: Humming kernels take weights and quantization state from
        ``self.layer``, so ``w1``, ``w2``, ``a1q_scale`` and ``a2_scale`` are
        not used; ``output`` and ``global_num_experts`` are not referenced in
        this body either. The result is written into ``buffers["output"]``
        as returned by ``prepare_buffers`` (per the original note this lands
        in workspace13 -- that aliasing is not visible here).

        Requires ``apply_router_weight_on_input`` to be False (asserted).
        """
        from vllm.utils.humming import HummingMethod

        assert not apply_router_weight_on_input

        # Flatten every leading dimension into a single token axis.
        hidden_dim = hidden_states.size(-1)
        hidden_states = hidden_states.view(-1, hidden_dim)

        num_tokens, top_k = topk_ids.size(0), topk_ids.size(1)
        buffers = self.prepare_buffers(
            workspace13, workspace2, num_tokens, top_k, activation
        )

        gate_up_kwargs, down_kwargs = self.prepare_humming_moe_kwargs(
            topk_ids=topk_ids,
            expert_map=expert_map,
            expert_tokens_meta=expert_tokens_meta,
        )

        # --- w13 (gate/up) stage ---
        gate_up_in, gate_up_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=hidden_states,
            quanted_input=buffers.get("quanted_gate_up_input", None),
            sublayer_name="w13",
        )
        gate_up_output = buffers["gate_up_output"]
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=gate_up_in,
            input_scale=gate_up_scale,
            outputs=gate_up_output,
            sublayer_name="w13",
            **gate_up_kwargs,
        )

        # --- activation ---
        activation_output = buffers["activation_output"]
        self.apply_activation(
            activation=activation,
            input=gate_up_output,
            output=activation_output,
        )

        # --- w2 (down) stage ---
        down_in, down_scale = HummingMethod.may_quant_input(
            layer=self.layer,
            inputs=activation_output,
            quanted_input=buffers.get("quanted_down_input", None),
            sublayer_name="w2",
        )
        down_output = buffers["down_output"]
        HummingMethod.forward_layer(
            layer=self.layer,
            inputs=down_in,
            input_scale=down_scale,
            outputs=down_output.view(-1, hidden_dim),
            sublayer_name="w2",
            **down_kwargs,
        )

        # --- combine with router weights into the final output buffer ---
        moe_fused_mul_sum(
            inputs=down_output.view(*topk_ids.shape, -1),
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            expert_map=expert_map,
            outputs=buffers["output"],
        )

apply(output, hidden_states, w1, w2, topk_weights, topk_ids, activation, global_num_experts, expert_map, a1q_scale, a2_scale, workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input)

Standard apply implementation for Humming indexed experts.

Note: Humming kernels handle weights and quantization internally through the layer object, so w1, w2, a1q_scale, a2_scale parameters are not used. The output is written into workspace13 via the buffer management.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
) -> None:
    """
    Standard apply implementation for Humming indexed experts.

    Runs, in order: ``may_quant_input`` + ``forward_layer`` for "w13",
    ``apply_activation``, ``may_quant_input`` + ``forward_layer`` for "w2",
    and finally ``moe_fused_mul_sum`` on the down output viewed as
    ``(*topk_ids.shape, -1)``.

    Note: Humming kernels take weights and quantization state from
    ``self.layer``, so ``w1``, ``w2``, ``a1q_scale`` and ``a2_scale`` are not
    used; ``output`` and ``global_num_experts`` are not referenced in this
    body either. The result is written into ``buffers["output"]`` as returned
    by ``prepare_buffers`` (per the original note this lands in workspace13 --
    that aliasing is not visible here).

    Requires ``apply_router_weight_on_input`` to be False (asserted).
    """
    from vllm.utils.humming import HummingMethod

    assert not apply_router_weight_on_input

    # Collapse all leading dimensions into one token axis.
    feature_dim = hidden_states.size(-1)
    hidden_states = hidden_states.view(-1, feature_dim)

    token_count, experts_per_token = topk_ids.size(0), topk_ids.size(1)
    buffers = self.prepare_buffers(
        workspace13, workspace2, token_count, experts_per_token, activation
    )

    w13_kwargs, w2_kwargs = self.prepare_humming_moe_kwargs(
        topk_ids=topk_ids,
        expert_map=expert_map,
        expert_tokens_meta=expert_tokens_meta,
    )

    # Stage 1: "w13" sublayer.
    w13_inputs, w13_scale = HummingMethod.may_quant_input(
        layer=self.layer,
        inputs=hidden_states,
        quanted_input=buffers.get("quanted_gate_up_input", None),
        sublayer_name="w13",
    )
    gate_up_buf = buffers["gate_up_output"]
    HummingMethod.forward_layer(
        layer=self.layer,
        inputs=w13_inputs,
        input_scale=w13_scale,
        outputs=gate_up_buf,
        sublayer_name="w13",
        **w13_kwargs,
    )

    # Stage 2: activation from the gate/up buffer into the activation buffer.
    act_buf = buffers["activation_output"]
    self.apply_activation(
        activation=activation,
        input=gate_up_buf,
        output=act_buf,
    )

    # Stage 3: "w2" sublayer.
    w2_inputs, w2_scale = HummingMethod.may_quant_input(
        layer=self.layer,
        inputs=act_buf,
        quanted_input=buffers.get("quanted_down_input", None),
        sublayer_name="w2",
    )
    down_buf = buffers["down_output"]
    HummingMethod.forward_layer(
        layer=self.layer,
        inputs=w2_inputs,
        input_scale=w2_scale,
        outputs=down_buf.view(-1, feature_dim),
        sublayer_name="w2",
        **w2_kwargs,
    )

    # Stage 4: combine with the router weights into the final output buffer.
    moe_fused_mul_sum(
        inputs=down_buf.view(*topk_ids.shape, -1),
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        expert_map=expert_map,
        outputs=buffers["output"],
    )