vllm.model_executor.layers.fused_moe

Modules:

Name	Description
`cutlass_moe`	CUTLASS based Fused MoE kernels.
`deep_gemm_moe`
`fused_batched_moe`	Fused batched MoE kernel.
`fused_marlin_moe`	Fused MoE utilities for GPTQ.
`fused_moe`	Fused MoE kernel.
`layer`
`modular_kernel`
`moe_align_block_size`
`moe_pallas`
`moe_permute_unpermute`
`moe_torch_iterative`
`pplx_prepare_finalize`
`prepare_finalize`
`rocm_aiter_fused_moe`
`triton_deep_gemm_moe`
`utils`

__all__ `module-attribute` ¶

__all__ = [
    "FusedMoE",
    "FusedMoEMethodBase",
    "FusedMoeWeightScaleSupported",
    "override_config",
    "get_config",
]

_config `module-attribute` ¶

_config: Optional[dict[str, Any]] = None

FusedMoE ¶

Bases: Module

FusedMoE layer for MoE models.

This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2).

Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We copy that naming convention here and handle any remapping in the load_weights function in each model implementation.

Parameters:

Name	Type	Description	Default
`num_experts`	`int`	Number of experts in the model	required
`top_k`	`int`	Number of experts selected for each token	required
`hidden_size`	`int`	Input hidden state size of the transformer	required
`intermediate_size`	`int`	Intermediate size of the experts	required
`params_dtype`	`Optional[dtype]`	Data type for the parameters.	`None`
`reduce_results`	`bool`	Whether to all_reduce the output of the layer	`False`
`renormalize`	`bool`	Whether to renormalize the logits in the fused_moe kernel	`True`
`quant_config`	`Optional[QuantizationConfig]`	Quantization configuration.	`None`

Source code in vllm/model_executor/layers/fused_moe/layer.py

class FusedMoE(torch.nn.Module):
    """FusedMoE layer for MoE models.

    This layer contains both MergedColumnParallel weights (gate_up_proj /
    w13) and RowParallelLinear weights (down_proj/ w2).

    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
    copy that naming convention here and handle any remapping in the
    load_weights function in each model implementation.

    Args:
        num_experts: Number of experts in the model (global count)
        top_k: Number of experts selected for each token
        hidden_size: Input hidden state size of the transformer
        intermediate_size: Intermediate size of the experts; must be
            divisible by the tensor-parallel size
        params_dtype: Data type for the parameters. Defaults to
            ``torch.get_default_dtype()`` when None.
        reduce_results: Whether to all_reduce the output of the layer
        renormalize: Whether to renormalize the logits in the fused_moe kernel
        use_grouped_topk: Whether to use grouped top-k routing; requires
            ``num_expert_group`` and ``topk_group`` to be set
        quant_config: Quantization configuration. When None the
            unquantized method is used.
        tp_size: Optional override for the tensor-parallel size
        dp_size: Optional override for the data-parallel size
        prefix: Layer name; registered in the static forward context
            when the data-parallel size is not 1
        scoring_func: Scoring function name; only "softmax" is accepted
            when ``use_grouped_topk`` is False
        activation: Activation name forwarded to the quant method
    """

    def __init__(
        self,
        num_experts: int,  # Global number of experts
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        params_dtype: Optional[torch.dtype] = None,
        reduce_results: bool = False,
        renormalize: bool = True,
        use_grouped_topk: bool = False,
        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
        quant_config: Optional[QuantizationConfig] = None,
        tp_size: Optional[int] = None,
        ep_size: Optional[int] = None,
        dp_size: Optional[int] = None,
        prefix: str = "",
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
    ):
        """Configure parallelism, routing options and quantized weights.

        The constructor builds the MoE parallel config, decides the expert
        placement for this rank, selects the quantization method and finally
        asks that method to create the layer's weights.

        Raises:
            ValueError: if ``prefix`` is already registered in the static
                forward context (only checked when the data-parallel size
                is not 1), or if ``scoring_func`` is not "softmax" while
                ``use_grouped_topk`` is False.
            AssertionError: if ``intermediate_size`` is not divisible by the
                tensor-parallel size, if grouped top-k is requested without
                ``num_expert_group`` / ``topk_group``, or if the selected
                quant method is not a ``FusedMoEMethodBase``.
        """
        super().__init__()

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype

        # Explicit tp_size / dp_size arguments take precedence; otherwise
        # fall back to the TP world size and the DP group's world size.
        # NOTE(review): the `ep_size` argument is not read anywhere in this
        # constructor; `self.ep_size` below comes from moe_parallel_config.
        vllm_config = get_current_vllm_config()
        self.moe_parallel_config: FusedMoEParallelConfig = (
            FusedMoEParallelConfig.make(
                tp_size_=(tp_size if tp_size is not None else
                          get_tensor_model_parallel_world_size()),
                dp_size_=(dp_size if dp_size is not None else
                          get_dp_group().world_size),
                vllm_parallel_config=vllm_config.parallel_config))

        self.global_num_experts = num_experts

        # For smuggling this layer into the fused moe custom op
        # (forward() looks the layer up by `layer_name` when the direct call
        # path is not used, so the name must be unique).
        self.use_direct_call = self.dp_size == 1
        if not self.use_direct_call:
            compilation_config = vllm_config.compilation_config
            if prefix in compilation_config.static_forward_context:
                raise ValueError("Duplicate layer name: {}".format(prefix))
            compilation_config.static_forward_context[prefix] = self
            self.layer_name = prefix

        # Determine expert maps
        # Without expert parallelism every expert is local and no map is kept.
        if self.use_ep:
            self.local_num_experts, self.expert_map = determine_expert_map(
                ep_size=self.ep_size,
                ep_rank=self.ep_rank,
                global_num_experts=self.global_num_experts)
        else:
            self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                       None)

        self.top_k = top_k

        # The intermediate dimension is split evenly across TP ranks.
        assert intermediate_size % self.tp_size == 0
        self.hidden_size = hidden_size
        self.intermediate_size_per_partition = intermediate_size // self.tp_size
        self.reduce_results = reduce_results
        self.renormalize = renormalize
        self.use_grouped_topk = use_grouped_topk
        if self.use_grouped_topk:
            assert num_expert_group is not None and topk_group is not None
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.custom_routing_function = custom_routing_function
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias
        self.apply_router_weight_on_input = apply_router_weight_on_input
        self.activation = activation

        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
        # The HPU extension is imported only inside this branch.
        if current_platform.is_hpu():
            from vllm_hpu_extension.ops import DynamicFusedMOE
            self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)

        moe = MoEConfig(
            num_experts=self.global_num_experts,
            experts_per_token=top_k,
            hidden_dim=hidden_size,
            num_local_experts=self.local_num_experts,
            moe_parallel_config=self.moe_parallel_config,
            # TODO (bnell): this needs to be fixed for quantized types.
            in_dtype=params_dtype,
            max_num_tokens=MOE_DP_CHUNK_SIZE,
        )
        self.moe_config = moe
        self.quant_config = quant_config

        # Note: get_quant_method will look at the layer's local_num_experts
        # for heuristic purposes, so it must be initialized first.
        quant_method: Optional[QuantizeMethodBase] = None

        if quant_config is None:
            quant_method = UnquantizedFusedMoEMethod(moe)
        else:
            quant_method = quant_config.get_quant_method(self, prefix)

        assert quant_method is not None
        assert isinstance(quant_method, FusedMoEMethodBase)
        self.quant_method = quant_method

        # Keyword arguments forwarded to quant_method.create_weights below.
        moe_quant_params = {
            "num_experts": self.local_num_experts,
            "hidden_size": hidden_size,
            "intermediate_size_per_partition":
            self.intermediate_size_per_partition,
            "params_dtype": params_dtype,
            "weight_loader": self.weight_loader,
        }
        # need full intermediate size pre-sharding for WNA16 act order
        if (self.quant_method.__class__.__name__
                in ("GPTQMarlinMoEMethod",
                    "CompressedTensorsWNA16MarlinMoEMethod",
                    "CompressedTensorsWNA16MoEMethod")):
            moe_quant_params["intermediate_size_full"] = intermediate_size

        self.quant_method.create_weights(layer=self, **moe_quant_params)

    @property
    def tp_size(self):
        return self.moe_parallel_config.tp_size

    @property
    def dp_size(self):
        return self.moe_parallel_config.dp_size

    @property
    def ep_size(self):
        return self.moe_parallel_config.ep_size

    @property
    def tp_rank(self):
        return self.moe_parallel_config.tp_rank

    @property
    def dp_rank(self):
        return self.moe_parallel_config.dp_rank

    @property
    def ep_rank(self):
        return self.moe_parallel_config.ep_rank

    @property
    def use_ep(self):
        return self.moe_parallel_config.use_ep

    @property
    def use_pplx_kernels(self):
        return self.moe_parallel_config.use_pplx_kernels

    def _load_per_tensor_weight_scale(self, shard_id: str,
                                      param: torch.nn.Parameter,
                                      loaded_weight: torch.Tensor,
                                      expert_id: int):
        param_data = param.data
        # for per tensor weight quantization
        if shard_id in ("w1", "w3"):
            # We have to keep the weight scales of w1 and w3 because
            # we need to re-quantize w1/w3 weights after weight loading.
            idx = 0 if shard_id == "w1" else 1
            param_data[expert_id][idx] = loaded_weight
        # If we are in the row parallel case (down_proj)
        elif shard_id == "w2":
            param_data[expert_id] = loaded_weight

    def _load_model_weight_or_group_weight_scale(self,
                                                 shard_dim: int,
                                                 expert_data: torch.Tensor,
                                                 shard_id: str,
                                                 loaded_weight: torch.Tensor,
                                                 tp_rank: int,
                                                 load_full_w2: bool = False):
        """
        Load grouped weight scales for group quantization or model weights
            :param shard_dim: dimension to shard
            :param expert_data: parameter for a particular expert
            :param shard_id: either w1, w2, or w3
            :param loaded_weight: checkpoint weight to load into the param
            :param tp_rank: tensor parallel rank
            :param load_full_w2: whether or not the w2 loaded should be sharded.
        """
        if shard_id == "w2":
            # In the case where we have actorder/g_idx, we do not partition the
            # w2 scales, as indicated by `load_full` argument, for all tp cases
            self._load_w2(shard_dim=shard_dim,
                          loaded_weight=loaded_weight,
                          expert_data=expert_data,
                          tp_rank=tp_rank,
                          load_full=load_full_w2)
        elif shard_id in ("w1", "w3"):
            self._load_w13(shard_id=shard_id,
                           shard_dim=shard_dim,
                           loaded_weight=loaded_weight,
                           expert_data=expert_data,
                           tp_rank=tp_rank)

    def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
                                       shard_dim: int, shard_id: str,
                                       loaded_weight: torch.Tensor,
                                       tp_rank: int):
        # for per channel weight quantization
        if shard_id == "w2":
            expert_data.copy_(loaded_weight)
        elif shard_id in ("w1", "w3"):
            self._load_w13(shard_id=shard_id,
                           shard_dim=shard_dim,
                           loaded_weight=loaded_weight,
                           expert_data=expert_data,
                           tp_rank=tp_rank)

    def _load_w13(self, expert_data: torch.Tensor, shard_dim: int,
                  shard_id: str, loaded_weight: torch.Tensor, tp_rank: int):

        # Index the loaded weight for tp sharding.
        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
        shard_size = expert_data.shape[shard_dim] // 2
        loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
                                             shard_size)
        # Narrow parameter and load.
        # w1, gate_proj: Load into first logical weight of w13.
        if shard_id == "w1":
            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
        # w3, up_proj: Load into second logical weight of w13.
        else:
            assert shard_id == "w3"
            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
        expert_data.copy_(loaded_weight)

    def _load_w2(self,
                 expert_data: torch.Tensor,
                 shard_dim: int,
                 loaded_weight: torch.Tensor,
                 tp_rank: int,
                 load_full: bool = False):

        # Index the loaded weight for tp sharding.
        # down_proj: "RowParallel" so tp sharding on input_dim
        # Narrow parameter and load.
        shard_size = expert_data.shape[shard_dim]
        if not load_full:
            loaded_weight = loaded_weight.narrow(shard_dim,
                                                 shard_size * tp_rank,
                                                 shard_size)
        # w2, down_proj: Load into only logical weight of w2.
        expert_data.copy_(loaded_weight)

    def _load_single_value(self, param: torch.nn.Parameter,
                           loaded_weight: torch.Tensor, expert_id: int):
        param_data = param.data

        # Input scales can be loaded directly and should be equal.
        param_data[expert_id] = loaded_weight

    def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
                    shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):

        if shard_id == "w2":
            self._load_w2(shard_dim=shard_dim,
                          loaded_weight=loaded_weight,
                          expert_data=expert_data,
                          tp_rank=tp_rank)
        else:
            assert shard_id in ("w1", "w3")
            expert_data.copy_(loaded_weight)

    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
        if self.expert_map is None:
            return expert_id
        return self.expert_map[expert_id].item()

    def weight_loader(self, param: torch.nn.Parameter,
                      loaded_weight: torch.Tensor, weight_name: str,
                      shard_id: str, expert_id: int) -> None:

        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
        if expert_id == -1:
            return
        quant_method_name = self.quant_method.__class__.__name__
        # compressed-tensors checkpoints with packed weights are stored flipped
        # TODO (mgoin): check self.quant_method.quant_config.quant_format
        # against known CompressionFormat enum values that have this quality
        if self.quant_method.__class__.__name__ in (
                "CompressedTensorsWNA16MarlinMoEMethod",
                "CompressedTensorsWNA16MoEMethod"):
            loaded_weight = loaded_weight.t().contiguous()

        if shard_id not in ("w1", "w2", "w3"):
            raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
                             f"got {shard_id}.")

        WEIGHT_SCALE_SUPPORTED = [
            e.value for e in FusedMoeWeightScaleSupported
        ]
        # Fetch the dim to shard the parameter/loaded weight
        # based on the shard id. This will be whatever
        # dimension intermediate_size_per_partition is used.
        SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}

        is_gguf_weight = getattr(param, "is_gguf_weight", False)
        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
        if is_gguf_weight_type:
            param.weight_type = loaded_weight.item()
            param.data.copy_(loaded_weight)
            return

        # is_transposed: if the dim to shard the weight
        # should be flipped. Required by GPTQ, compressed-tensors
        # should be whatever dimension intermediate_size_per_partition is
        is_transposed = getattr(param, "is_transposed", False)
        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
        if is_transposed:
            shard_dim = int(not shard_dim)

        full_load = len(loaded_weight.shape) == 3
        if full_load:
            shard_dim += 1

        # Materialize GGUF UninitializedParameter
        if is_gguf_weight and isinstance(param, UninitializedParameter):
            final_shape = list(loaded_weight.shape)
            if shard_id in ["w1", "w3"]:
                final_shape[1] *= 2
            final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
            param.materialize(final_shape, dtype=loaded_weight.dtype)

        expert_data = param.data if full_load else param.data[expert_id]
        # Case input scale: input_scale loading is only supported for fp8
        if "input_scale" in weight_name:
            # this is needed for compressed-tensors only
            loaded_weight = loaded_weight.to(param.data.device)

            if ("compressed" in quant_method_name.lower()
                    and param.data[expert_id] != 1
                    and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
                raise ValueError(
                    "input_scales of w1 and w3 of a layer "
                    f"must be equal. But got {param.data[expert_id]} "
                    f"vs. {loaded_weight}")

            self._load_single_value(param=param,
                                    loaded_weight=loaded_weight,
                                    expert_id=expert_id)
            return

        # Case g_idx
        if "g_idx" in weight_name:
            self._load_g_idx(shard_dim=0,
                             shard_id=shard_id,
                             loaded_weight=loaded_weight,
                             expert_data=expert_data,
                             tp_rank=self.tp_rank)
            return

        if "ModelOpt" in quant_method_name:
            if ('weight_scale_2' in weight_name
                    or 'input_scale' in weight_name):
                self._load_per_tensor_weight_scale(shard_id=shard_id,
                                                   param=param,
                                                   loaded_weight=loaded_weight,
                                                   expert_id=expert_id)
            elif "weight" in weight_name:
                self._load_model_weight_or_group_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank)
            return

        # Case weight scales, zero_points and offset
        if ("scale" in weight_name or "zero" in weight_name
                or "offset" in weight_name):
            # load the weight scales and zp based on the quantization scheme
            # supported weight scales/zp can be found in
            # FusedMoeWeightScaleSupported
            # TODO @dsikka: once hardened, refactor to use vLLM Parameters
            # specific to each case
            quant_method = getattr(param, "quant_method", None)
            if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
                self._load_per_channel_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank)
            elif quant_method in [
                    FusedMoeWeightScaleSupported.GROUP.value,
                    FusedMoeWeightScaleSupported.BLOCK.value,
            ]:
                self._load_model_weight_or_group_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.tp_rank,
                    load_full_w2=getattr(param, "load_full_w2", False))
            elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
                self._load_per_tensor_weight_scale(shard_id=shard_id,
                                                   param=param,
                                                   loaded_weight=loaded_weight,
                                                   expert_id=expert_id)
            else:
                raise ValueError(
                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
            return

        # Case weight_shape
        if "weight_shape" in weight_name:
            # only required by compressed-tensors
            self._load_single_value(param=param,
                                    loaded_weight=loaded_weight,
                                    expert_id=expert_id)
            return

        # Case model weights
        if "weight" in weight_name:
            self._load_model_weight_or_group_weight_scale(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.tp_rank)
            return

    @staticmethod
    def select_experts(hidden_states: torch.Tensor,
                       router_logits: torch.Tensor,
                       top_k: int,
                       use_grouped_topk: bool,
                       renormalize: bool,
                       topk_group: Optional[int] = None,
                       num_expert_group: Optional[int] = None,
                       custom_routing_function: Optional[Callable] = None,
                       scoring_func: str = "softmax",
                       e_score_correction_bias: Optional[torch.Tensor] = None,
                       indices_type: Optional[torch.dtype] = None):
        """Pick the experts for each token and return their weights and ids.

        Routing strategy, in priority order: grouped top-k when
        ``use_grouped_topk`` is set, otherwise ``custom_routing_function``
        when one is given, otherwise ``fused_topk``.

        Returns:
            A ``(topk_weights, topk_ids)`` tuple. When ``indices_type`` is
            given, ``topk_ids`` is produced in (or cast to) that dtype.
        """
        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk

        if not use_grouped_topk and custom_routing_function is None:
            # fused_topk takes the index dtype directly; its third return
            # value is not needed here.
            topk_weights, topk_ids, _ = fused_topk(
                hidden_states=hidden_states,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize,
                indices_type=indices_type,
            )
            return topk_weights, topk_ids

        if use_grouped_topk:
            # DeepSeekV2 uses grouped_top_k
            assert topk_group is not None
            assert num_expert_group is not None
            topk_weights, topk_ids = grouped_topk(
                hidden_states=hidden_states,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize,
                num_expert_group=num_expert_group,
                topk_group=topk_group,
                scoring_func=scoring_func,
                e_score_correction_bias=e_score_correction_bias)
        else:
            topk_weights, topk_ids = custom_routing_function(
                hidden_states=hidden_states,
                gating_output=router_logits,
                topk=top_k,
                renormalize=renormalize)

        # Neither of these two routing paths takes an index dtype, so the
        # ids are cast afterwards when a dtype was requested.
        if indices_type is not None:
            topk_ids = topk_ids.to(dtype=indices_type)

        return topk_weights, topk_ids

    def must_reduce_shared_expert_outputs(self) -> bool:
        """
        The shared_experts are typically computed using the RowParallelLinear
        layer. The result of this function is typically used as
        the reduce_results argument to the module.
        When just tensor-parallel is used, it is not required to reduce
        the shared_experts results immediately. Instead we reduce at the
        once at the end of the MoE op. (Refer to DeepSeekV2MoE module)
        With EP and the pplx kernels - this is no longer viable as all
        GPU ranks in DP, produce the complete set of hidden_states.
        Therefore it is required that we reduce the shared_experts output
        early.
        """
        return self.use_pplx_kernels

    def maybe_all_reduce_tensor_model_parallel(
            self, final_hidden_states: torch.Tensor):
        """
        The pplx combine kernel reduces across GPU ranks by default.
        """
        if self.use_pplx_kernels:
            return final_hidden_states
        else:
            return tensor_model_parallel_all_reduce(final_hidden_states)

    def forward(self, hidden_states: torch.Tensor,
                router_logits: torch.Tensor):
        if self.use_direct_call:
            return self.forward_impl(hidden_states, router_logits)
        else:
            return torch.ops.vllm.moe_forward(hidden_states, router_logits,
                                              self.layer_name)

    def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                             full_router_logits: torch.Tensor):
        """Run the MoE computation in chunks of ``MOE_DP_CHUNK_SIZE`` tokens.

        The number of chunks is derived from the maximum token count across
        data-parallel ranks (taken from the forward context), not from this
        rank's own token count. Chunks that start beyond this rank's tokens
        are still processed, with a clamped range, but their results are not
        stored.
        NOTE(review): presumably this keeps every DP rank issuing the same
        number of kernel calls - confirm against the pplx kernels.

        Returns:
            A tensor shaped like ``full_hidden_states`` holding the outputs.
        """

        full_final_hidden_states = torch.empty_like(full_hidden_states)

        def process_chunk(chunk_start, chunk_end, skip_result_store=False):
            hidden_states = full_hidden_states[chunk_start:chunk_end, :]
            router_logits = full_router_logits[chunk_start:chunk_end, :]

            # Matrix multiply.
            final_hidden_states = self.quant_method.apply(
                layer=self,
                x=hidden_states,
                router_logits=router_logits,
                top_k=self.top_k,
                renormalize=self.renormalize,
                use_grouped_topk=self.use_grouped_topk,
                global_num_experts=self.global_num_experts,
                expert_map=self.expert_map,
                topk_group=self.topk_group,
                num_expert_group=self.num_expert_group,
                custom_routing_function=self.custom_routing_function,
                scoring_func=self.scoring_func,
                e_score_correction_bias=self.e_score_correction_bias,
                activation=self.activation,
                # Passed exactly as in forward_impl so the layer's setting
                # is not silently dropped on the chunked path.
                apply_router_weight_on_input=self.apply_router_weight_on_input,
            )

            if not skip_result_store:
                full_final_hidden_states[chunk_start:chunk_end, :].copy_(
                    final_hidden_states)

        ctx = get_forward_context()
        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
        moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE

        num_tokens = full_hidden_states.size(0)
        for chunk_start_ in range(0, max_tokens_across_dp,
                                  moe_dp_chunk_size_per_rank):
            chunk_start = chunk_start_
            chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
                            max_tokens_across_dp)
            # clamp start and end
            chunk_start = min(chunk_start, num_tokens - 1)
            chunk_end = min(chunk_end, num_tokens)

            # The unclamped start decides whether this chunk holds any of
            # this rank's tokens; if not, its output is discarded.
            process_chunk(chunk_start,
                          chunk_end,
                          skip_result_store=chunk_start_ >= num_tokens)

        return full_final_hidden_states

    def forward_impl(self, hidden_states: torch.Tensor,
                     router_logits: torch.Tensor):
        assert self.quant_method is not None
        if self.moe_parallel_config.use_pplx_kernels:
            return self.forward_impl_chunked(hidden_states, router_logits)

        if self.dp_size > 1:
            hidden_states, router_logits = get_ep_group().dispatch(
                hidden_states, router_logits)
        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
            layer=self,
            x=hidden_states,
            router_logits=router_logits,
            top_k=self.top_k,
            renormalize=self.renormalize,
            use_grouped_topk=self.use_grouped_topk,
            global_num_experts=self.global_num_experts,
            expert_map=self.expert_map,
            topk_group=self.topk_group,
            num_expert_group=self.num_expert_group,
            custom_routing_function=self.custom_routing_function,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
            activation=self.activation,
            apply_router_weight_on_input=self.apply_router_weight_on_input,
        )

        if self.dp_size > 1:
            final_hidden_states = get_ep_group().combine(final_hidden_states)

        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
            # Default set to False. (May have to add shared expert outputs.)
            final_hidden_states = tensor_model_parallel_all_reduce(
                final_hidden_states)

        return final_hidden_states

    @classmethod
    def make_expert_params_mapping(
            cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
            ckpt_up_proj_name: str,
            num_experts: int) -> list[tuple[str, str, int, str]]:

        return [
            # (param_name, weight_name, expert_id, shard_id)
            ("experts.w13_" if weight_name
             in [ckpt_gate_proj_name, ckpt_up_proj_name] else "experts.w2_",
             f"experts.{expert_id}.{weight_name}.", expert_id, shard_id)
            for expert_id in range(num_experts) for shard_id, weight_name in [
                ("w1", ckpt_gate_proj_name),
                ("w2", ckpt_down_proj_name),
                ("w3", ckpt_up_proj_name),
            ]
        ]

    def extra_repr(self) -> str:

        s = (
            f"global_num_experts={self.global_num_experts}, "
            f"local_num_experts={self.local_num_experts}, "
            f"top_k={self.top_k}, "
            f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
            f"tp_size={self.tp_size},\n"
            f"ep_size={self.ep_size}, "
            f"reduce_results={self.reduce_results}, "
            f"renormalize={self.renormalize}, "
            f"use_grouped_topk={self.use_grouped_topk}")

        if self.use_grouped_topk:
            s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501

        s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501

        return s

activation `instance-attribute` ¶

activation = activation

apply_router_weight_on_input `instance-attribute` ¶

apply_router_weight_on_input = apply_router_weight_on_input

custom_routing_function `instance-attribute` ¶

custom_routing_function = custom_routing_function

dp_rank `property` ¶

dp_rank

dp_size `property` ¶

dp_size

e_score_correction_bias `instance-attribute` ¶

e_score_correction_bias = e_score_correction_bias

ep_rank `property` ¶

ep_rank

ep_size `property` ¶

ep_size

global_num_experts `instance-attribute` ¶

global_num_experts = num_experts

hidden_size `instance-attribute` ¶

hidden_size = hidden_size

hpu_fused_moe `instance-attribute` ¶

hpu_fused_moe = DynamicFusedMOE(global_num_experts)

intermediate_size_per_partition `instance-attribute` ¶

intermediate_size_per_partition = (
    intermediate_size // tp_size
)

layer_name `instance-attribute` ¶

layer_name = prefix

moe_config `instance-attribute` ¶

moe_config = moe

moe_parallel_config `instance-attribute` ¶

moe_parallel_config: FusedMoEParallelConfig = make(
    tp_size_=tp_size
    if tp_size is not None
    else get_tensor_model_parallel_world_size(),
    dp_size_=dp_size if dp_size is not None else world_size,
    vllm_parallel_config=parallel_config,
)

num_expert_group `instance-attribute` ¶

num_expert_group = num_expert_group

params_dtype `instance-attribute` ¶

params_dtype = params_dtype

quant_config `instance-attribute` ¶

quant_config = quant_config

quant_method `instance-attribute` ¶

quant_method = quant_method

reduce_results `instance-attribute` ¶

reduce_results = reduce_results

renormalize `instance-attribute` ¶

renormalize = renormalize

scoring_func `instance-attribute` ¶

scoring_func = scoring_func

top_k `instance-attribute` ¶

top_k = top_k

topk_group `instance-attribute` ¶

topk_group = topk_group

tp_rank `property` ¶

tp_rank

tp_size `property` ¶

tp_size

use_direct_call `instance-attribute` ¶

use_direct_call = dp_size == 1

use_ep `property` ¶

use_ep

use_grouped_topk `instance-attribute` ¶

use_grouped_topk = use_grouped_topk

use_pplx_kernels `property` ¶

use_pplx_kernels

__init__ ¶

__init__(
    num_experts: int,
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    params_dtype: Optional[dtype] = None,
    reduce_results: bool = False,
    renormalize: bool = True,
    use_grouped_topk: bool = False,
    num_expert_group: Optional[int] = None,
    topk_group: Optional[int] = None,
    quant_config: Optional[QuantizationConfig] = None,
    tp_size: Optional[int] = None,
    ep_size: Optional[int] = None,
    dp_size: Optional[int] = None,
    prefix: str = "",
    custom_routing_function: Optional[Callable] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[Tensor] = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def __init__(
    self,
    num_experts: int,  # Global number of experts
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    params_dtype: Optional[torch.dtype] = None,
    reduce_results: bool = False,
    renormalize: bool = True,
    use_grouped_topk: bool = False,
    num_expert_group: Optional[int] = None,
    topk_group: Optional[int] = None,
    quant_config: Optional[QuantizationConfig] = None,
    tp_size: Optional[int] = None,
    ep_size: Optional[int] = None,
    dp_size: Optional[int] = None,
    prefix: str = "",
    custom_routing_function: Optional[Callable] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[torch.Tensor] = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
):
    """Configure parallelism, expert placement, routing options and weights.

    The steps run in a fixed order because later ones read state set by
    earlier ones: the parallel config is built first, then the expert map,
    then the ``MoEConfig``, then the quantization method, and finally the
    quantization method creates the layer's weights.

    Args:
        num_experts: Global number of experts; stored as
            ``self.global_num_experts``.
        top_k: Stored as ``self.top_k`` and passed to ``MoEConfig`` as
            ``experts_per_token``.
        hidden_size: Stored as ``self.hidden_size`` and passed to
            ``MoEConfig`` as ``hidden_dim``.
        intermediate_size: Full intermediate size. Must be divisible by
            ``self.tp_size``; the quotient is stored as
            ``self.intermediate_size_per_partition``.
        params_dtype: Parameter dtype; ``torch.get_default_dtype()`` is
            used when ``None``.
        reduce_results: Stored; ``forward_impl`` reads it to decide whether
            to all-reduce the output.
        renormalize: Stored and forwarded to the quant method at forward
            time.
        use_grouped_topk: When true, ``num_expert_group`` and
            ``topk_group`` must both be provided.
        num_expert_group: Stored as-is.
        topk_group: Stored as-is.
        quant_config: When ``None`` an ``UnquantizedFusedMoEMethod`` is
            used; otherwise ``quant_config.get_quant_method(self, prefix)``
            supplies the method.
        tp_size: Override for the tensor-parallel size; defaults to
            ``get_tensor_model_parallel_world_size()``.
        ep_size: Not read directly in this constructor body
            (``self.ep_size`` is used instead).
        dp_size: Override for the data-parallel size; defaults to
            ``get_dp_group().world_size``.
        prefix: Layer name. Used as the key in the compilation config's
            static forward context when the direct call path is disabled.
        custom_routing_function: Stored as-is.
        scoring_func: Stored; anything other than ``"softmax"`` requires
            ``use_grouped_topk``.
        e_score_correction_bias: Stored as-is.
        apply_router_weight_on_input: Stored as-is.
        activation: Stored as-is.

    Raises:
        ValueError: If ``prefix`` is already registered in the static
            forward context, or if ``scoring_func`` is not ``"softmax"``
            while ``use_grouped_topk`` is false.
    """
    super().__init__()

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.params_dtype = params_dtype

    vllm_config = get_current_vllm_config()
    # Explicit tp_size / dp_size arguments win over the process-group sizes.
    self.moe_parallel_config: FusedMoEParallelConfig = (
        FusedMoEParallelConfig.make(
            tp_size_=(tp_size if tp_size is not None else
                      get_tensor_model_parallel_world_size()),
            dp_size_=(dp_size if dp_size is not None else
                      get_dp_group().world_size),
            vllm_parallel_config=vllm_config.parallel_config))

    self.global_num_experts = num_experts

    # For smuggling this layer into the fused moe custom op
    self.use_direct_call = self.dp_size == 1
    if not self.use_direct_call:
        # Register the layer under its prefix; forward() later passes
        # self.layer_name to torch.ops.vllm.moe_forward.
        compilation_config = vllm_config.compilation_config
        if prefix in compilation_config.static_forward_context:
            raise ValueError("Duplicate layer name: {}".format(prefix))
        compilation_config.static_forward_context[prefix] = self
        self.layer_name = prefix

    # Determine expert maps
    if self.use_ep:
        self.local_num_experts, self.expert_map = determine_expert_map(
            ep_size=self.ep_size,
            ep_rank=self.ep_rank,
            global_num_experts=self.global_num_experts)
    else:
        # Without expert parallelism every expert is local and no map is
        # needed.
        self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                   None)

    self.top_k = top_k

    assert intermediate_size % self.tp_size == 0
    self.hidden_size = hidden_size
    self.intermediate_size_per_partition = intermediate_size // self.tp_size
    self.reduce_results = reduce_results
    self.renormalize = renormalize
    self.use_grouped_topk = use_grouped_topk
    if self.use_grouped_topk:
        assert num_expert_group is not None and topk_group is not None
    self.num_expert_group = num_expert_group
    self.topk_group = topk_group
    self.custom_routing_function = custom_routing_function
    self.scoring_func = scoring_func
    self.e_score_correction_bias = e_score_correction_bias
    self.apply_router_weight_on_input = apply_router_weight_on_input
    self.activation = activation

    if self.scoring_func != "softmax" and not self.use_grouped_topk:
        raise ValueError("Only softmax scoring function is supported for "
                         "non-grouped topk.")
    if current_platform.is_hpu():
        # Imported lazily so the extension is only required on HPU.
        from vllm_hpu_extension.ops import DynamicFusedMOE
        self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)

    moe = MoEConfig(
        num_experts=self.global_num_experts,
        experts_per_token=top_k,
        hidden_dim=hidden_size,
        num_local_experts=self.local_num_experts,
        moe_parallel_config=self.moe_parallel_config,
        # TODO (bnell): this needs to be fixed for quantized types.
        in_dtype=params_dtype,
        max_num_tokens=MOE_DP_CHUNK_SIZE,
    )
    self.moe_config = moe
    self.quant_config = quant_config

    # Note: get_quant_method will look at the layer's local_num_experts
    # for heuristic purposes, so it must be initialized first.
    quant_method: Optional[QuantizeMethodBase] = None

    if quant_config is None:
        quant_method = UnquantizedFusedMoEMethod(moe)
    else:
        quant_method = quant_config.get_quant_method(self, prefix)

    assert quant_method is not None
    assert isinstance(quant_method, FusedMoEMethodBase)
    self.quant_method = quant_method

    # Keyword arguments handed to quant_method.create_weights; the
    # weight_loader entry is this layer's own bound loader method.
    moe_quant_params = {
        "num_experts": self.local_num_experts,
        "hidden_size": hidden_size,
        "intermediate_size_per_partition":
        self.intermediate_size_per_partition,
        "params_dtype": params_dtype,
        "weight_loader": self.weight_loader,
    }
    # need full intermediate size pre-sharding for WNA16 act order
    if (self.quant_method.__class__.__name__
            in ("GPTQMarlinMoEMethod",
                "CompressedTensorsWNA16MarlinMoEMethod",
                "CompressedTensorsWNA16MoEMethod")):
        moe_quant_params["intermediate_size_full"] = intermediate_size

    self.quant_method.create_weights(layer=self, **moe_quant_params)

_load_g_idx ¶

_load_g_idx(
    shard_id: str,
    expert_data: Tensor,
    shard_dim: int,
    loaded_weight: Tensor,
    tp_rank: int,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor,
                shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int):
    """Load a ``g_idx`` tensor for one expert shard.

    For ``"w1"`` / ``"w3"`` the checkpoint tensor is copied into
    ``expert_data`` unchanged. For ``"w2"`` the work is delegated to
    ``_load_w2`` with the given ``shard_dim`` and ``tp_rank``.
    """
    if shard_id != "w2":
        # Only the two gate/up shard ids are valid besides w2.
        assert shard_id in ("w1", "w3")
        expert_data.copy_(loaded_weight)
        return

    self._load_w2(shard_dim=shard_dim,
                  loaded_weight=loaded_weight,
                  expert_data=expert_data,
                  tp_rank=tp_rank)

_load_model_weight_or_group_weight_scale ¶

_load_model_weight_or_group_weight_scale(
    shard_dim: int,
    expert_data: Tensor,
    shard_id: str,
    loaded_weight: Tensor,
    tp_rank: int,
    load_full_w2: bool = False,
)

Load grouped weight scales for group quantization or model weights :param shard_dim: dimension to shard :param expert_data: parameter for a particular expert :param shard_id: either w1, w2, or w3 :param loaded_weight: checkpoint weight to load into the param :param tp_rank: tensor parallel rank :param load_full_w2: whether or not the w2 loaded should be sharded.

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_model_weight_or_group_weight_scale(self,
                                             shard_dim: int,
                                             expert_data: torch.Tensor,
                                             shard_id: str,
                                             loaded_weight: torch.Tensor,
                                             tp_rank: int,
                                             load_full_w2: bool = False):
    """
    Route a model weight or a group-quantization weight scale to the
    loader for its shard.
        :param shard_dim: dimension to shard
        :param expert_data: parameter for a particular expert
        :param shard_id: either w1, w2, or w3
        :param loaded_weight: checkpoint weight to load into the param
        :param tp_rank: tensor parallel rank
        :param load_full_w2: forwarded to ``_load_w2`` as ``load_full``;
            when true the w2 tensor is copied without tp narrowing.
    """
    shared_kwargs = dict(shard_dim=shard_dim,
                         loaded_weight=loaded_weight,
                         expert_data=expert_data,
                         tp_rank=tp_rank)

    if shard_id in ("w1", "w3"):
        self._load_w13(shard_id=shard_id, **shared_kwargs)
    elif shard_id == "w2":
        # With actorder/g_idx the w2 scales are not partitioned; the
        # caller signals that through load_full_w2.
        self._load_w2(load_full=load_full_w2, **shared_kwargs)

_load_per_channel_weight_scale ¶

_load_per_channel_weight_scale(
    expert_data: Tensor,
    shard_dim: int,
    shard_id: str,
    loaded_weight: Tensor,
    tp_rank: int,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_per_channel_weight_scale(self, expert_data: torch.Tensor,
                                   shard_dim: int, shard_id: str,
                                   loaded_weight: torch.Tensor,
                                   tp_rank: int):
    """Load a per-channel weight scale for one expert shard.

    ``"w1"`` / ``"w3"`` scales go through ``_load_w13``; a ``"w2"`` scale
    is copied into ``expert_data`` without narrowing. Any other
    ``shard_id`` is ignored.
    """
    if shard_id in ("w1", "w3"):
        self._load_w13(shard_id=shard_id,
                       shard_dim=shard_dim,
                       loaded_weight=loaded_weight,
                       expert_data=expert_data,
                       tp_rank=tp_rank)
    elif shard_id == "w2":
        expert_data.copy_(loaded_weight)

_load_per_tensor_weight_scale ¶

_load_per_tensor_weight_scale(
    shard_id: str,
    param: Parameter,
    loaded_weight: Tensor,
    expert_id: int,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_per_tensor_weight_scale(self, shard_id: str,
                                  param: torch.nn.Parameter,
                                  loaded_weight: torch.Tensor,
                                  expert_id: int):
    """Store a per-tensor weight scale for one expert.

    A ``"w2"`` scale fills ``param.data[expert_id]``. ``"w1"`` and
    ``"w3"`` scales are written to slots 0 and 1 of
    ``param.data[expert_id]`` respectively. Other shard ids are ignored.
    """
    scales = param.data
    if shard_id == "w2":
        # Row parallel case (down_proj): one scale per expert.
        scales[expert_id] = loaded_weight
    elif shard_id in ("w1", "w3"):
        # Both w1 and w3 scales are retained because the w1/w3 weights
        # are re-quantized after weight loading.
        slot = {"w1": 0, "w3": 1}[shard_id]
        scales[expert_id][slot] = loaded_weight

_load_single_value ¶

_load_single_value(
    param: Parameter, loaded_weight: Tensor, expert_id: int
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_single_value(self, param: torch.nn.Parameter,
                       loaded_weight: torch.Tensor, expert_id: int):
    """Write ``loaded_weight`` into the ``expert_id`` slot of ``param``."""
    # Input scales can be loaded directly and should be equal.
    per_expert_values = param.data
    per_expert_values[expert_id] = loaded_weight

_load_w13 ¶

_load_w13(
    expert_data: Tensor,
    shard_dim: int,
    shard_id: str,
    loaded_weight: Tensor,
    tp_rank: int,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_w13(self, expert_data: torch.Tensor, shard_dim: int,
              shard_id: str, loaded_weight: torch.Tensor, tp_rank: int):
    """Copy this rank's slice of a w1 or w3 tensor into the fused w13 param.

    ``expert_data`` is split in two equal halves along ``shard_dim``: the
    first half receives w1 (gate_proj), the second half w3 (up_proj). The
    checkpoint tensor is narrowed along the same dimension to the
    half-sized window starting at ``half * tp_rank``.
    """
    # gate_up_proj is "MergedColumnParallel", so tp sharding happens on
    # the output dim; each logical weight takes half of the fused param.
    half = expert_data.shape[shard_dim] // 2
    rank_slice = loaded_weight.narrow(shard_dim, half * tp_rank, half)

    if shard_id == "w1":
        dest_offset = 0
    else:
        assert shard_id == "w3"
        dest_offset = half

    expert_data.narrow(shard_dim, dest_offset, half).copy_(rank_slice)

_load_w2 ¶

_load_w2(
    expert_data: Tensor,
    shard_dim: int,
    loaded_weight: Tensor,
    tp_rank: int,
    load_full: bool = False,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _load_w2(self,
             expert_data: torch.Tensor,
             shard_dim: int,
             loaded_weight: torch.Tensor,
             tp_rank: int,
             load_full: bool = False):
    """Copy a w2 (down_proj) tensor into ``expert_data``.

    Unless ``load_full`` is set, the checkpoint tensor is first narrowed
    along ``shard_dim`` to the window of width
    ``expert_data.shape[shard_dim]`` starting at ``width * tp_rank``.
    """
    # down_proj is "RowParallel", so tp sharding happens on the input dim.
    width = expert_data.shape[shard_dim]
    source = loaded_weight
    if not load_full:
        source = source.narrow(shard_dim, width * tp_rank, width)
    # w2 is the only logical weight stored in this parameter.
    expert_data.copy_(source)

_map_global_expert_id_to_local_expert_id ¶

_map_global_expert_id_to_local_expert_id(
    expert_id: int,
) -> int

Source code in vllm/model_executor/layers/fused_moe/layer.py

def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
    """Return the local id for ``expert_id``.

    When ``self.expert_map`` is ``None`` the id is returned unchanged;
    otherwise the map entry at ``expert_id`` is returned as a Python
    scalar.
    """
    mapping = self.expert_map
    return expert_id if mapping is None else mapping[expert_id].item()

extra_repr ¶

extra_repr() -> str

Source code in vllm/model_executor/layers/fused_moe/layer.py

def extra_repr(self) -> str:
    """Return the configuration summary shown in the module's repr.

    The grouped-topk fields are included only when
    ``self.use_grouped_topk`` is true.
    """
    fragments = [
        f"global_num_experts={self.global_num_experts}, ",
        f"local_num_experts={self.local_num_experts}, ",
        f"top_k={self.top_k}, ",
        f"intermediate_size_per_partition={self.intermediate_size_per_partition}, ",  # noqa: E501
        f"tp_size={self.tp_size},\n",
        f"ep_size={self.ep_size}, ",
        f"reduce_results={self.reduce_results}, ",
        f"renormalize={self.renormalize}, ",
        f"use_grouped_topk={self.use_grouped_topk}",
    ]

    if self.use_grouped_topk:
        fragments.append(
            f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
        )

    fragments.append(
        f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501
    )

    return "".join(fragments)

forward ¶

forward(hidden_states: Tensor, router_logits: Tensor)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def forward(self, hidden_states: torch.Tensor,
            router_logits: torch.Tensor):
    """Run the layer, directly or through the registered custom op.

    When ``self.use_direct_call`` is true ``forward_impl`` is invoked
    in-process. Otherwise the call goes through
    ``torch.ops.vllm.moe_forward`` with ``self.layer_name``.
    """
    if not self.use_direct_call:
        return torch.ops.vllm.moe_forward(hidden_states, router_logits,
                                          self.layer_name)
    return self.forward_impl(hidden_states, router_logits)

forward_impl ¶

forward_impl(hidden_states: Tensor, router_logits: Tensor)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def forward_impl(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
    """Compute the MoE output for ``hidden_states``.

    With pplx kernels enabled the work is handed to
    ``forward_impl_chunked``. Otherwise the inputs are dispatched through
    the EP group when ``dp_size > 1``, the quant method's ``apply`` runs
    the expert computation, the result is combined again when
    ``dp_size > 1``, and finally all-reduced if ``reduce_results`` is set
    and either ``tp_size`` or ``ep_size`` exceeds one.
    """
    assert self.quant_method is not None
    if self.moe_parallel_config.use_pplx_kernels:
        return self.forward_impl_chunked(hidden_states, router_logits)

    if self.dp_size > 1:
        hidden_states, router_logits = get_ep_group().dispatch(
            hidden_states, router_logits)

    # Layer-level routing/activation settings forwarded to the quant method.
    layer_settings = dict(
        top_k=self.top_k,
        renormalize=self.renormalize,
        use_grouped_topk=self.use_grouped_topk,
        global_num_experts=self.global_num_experts,
        expert_map=self.expert_map,
        topk_group=self.topk_group,
        num_expert_group=self.num_expert_group,
        custom_routing_function=self.custom_routing_function,
        scoring_func=self.scoring_func,
        e_score_correction_bias=self.e_score_correction_bias,
        activation=self.activation,
        apply_router_weight_on_input=self.apply_router_weight_on_input,
    )
    # Matrix multiply.
    output = self.quant_method.apply(layer=self,
                                     x=hidden_states,
                                     router_logits=router_logits,
                                     **layer_settings)

    if self.dp_size > 1:
        output = get_ep_group().combine(output)

    # Default set to False. (May have to add shared expert outputs.)
    needs_all_reduce = self.reduce_results and (self.tp_size > 1
                                                or self.ep_size > 1)
    if needs_all_reduce:
        output = tensor_model_parallel_all_reduce(output)

    return output

forward_impl_chunked ¶

forward_impl_chunked(
    full_hidden_states: Tensor, full_router_logits: Tensor
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                         full_router_logits: torch.Tensor):
    """Run the MoE computation in chunks of ``MOE_DP_CHUNK_SIZE`` tokens.

    The loop bound is ``max_tokens_across_dp_cpu`` from the forward
    context's DP metadata rather than this rank's own token count, so the
    quant method's ``apply`` is called the same number of times however
    many tokens this rank holds. Iterations that start past the local
    token count re-run a clamped slice and discard its output.

    Args:
        full_hidden_states: Input activations; rows are sliced per chunk.
        full_router_logits: Router logits, sliced with the same row ranges.

    Returns:
        A tensor shaped like ``full_hidden_states`` holding the per-chunk
        outputs.
    """

    full_final_hidden_states = torch.empty_like(full_hidden_states)

    def process_chunk(chunk_start, chunk_end, skip_result_store=False):
        hidden_states = full_hidden_states[chunk_start:chunk_end, :]
        router_logits = full_router_logits[chunk_start:chunk_end, :]

        # Matrix multiply. The argument list mirrors forward_impl so the
        # chunked path honours every layer-level setting, including
        # apply_router_weight_on_input (previously left at the callee's
        # default here).
        final_hidden_states = self.quant_method.apply(
            layer=self,
            x=hidden_states,
            router_logits=router_logits,
            top_k=self.top_k,
            renormalize=self.renormalize,
            use_grouped_topk=self.use_grouped_topk,
            global_num_experts=self.global_num_experts,
            expert_map=self.expert_map,
            topk_group=self.topk_group,
            num_expert_group=self.num_expert_group,
            custom_routing_function=self.custom_routing_function,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
            activation=self.activation,
            apply_router_weight_on_input=self.apply_router_weight_on_input,
        )

        if not skip_result_store:
            full_final_hidden_states[chunk_start:chunk_end, :].copy_(
                final_hidden_states)

    ctx = get_forward_context()
    max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
    moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE

    num_tokens = full_hidden_states.size(0)
    for chunk_start_ in range(0, max_tokens_across_dp,
                              moe_dp_chunk_size_per_rank):
        chunk_start = chunk_start_
        chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
                        max_tokens_across_dp)
        # Clamp both ends into this rank's token range so the slice stays
        # valid even when the unclamped chunk lies beyond the local tokens.
        chunk_start = min(chunk_start, num_tokens - 1)
        chunk_end = min(chunk_end, num_tokens)

        # The unclamped start decides whether the chunk is real work: once
        # it is past the local token count the output is thrown away.
        process_chunk(chunk_start,
                      chunk_end,
                      skip_result_store=chunk_start_ >= num_tokens)

    return full_final_hidden_states

make_expert_params_mapping `classmethod` ¶

make_expert_params_mapping(
    ckpt_gate_proj_name: str,
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
) -> list[tuple[str, str, int, str]]

Source code in vllm/model_executor/layers/fused_moe/layer.py

@classmethod
def make_expert_params_mapping(
        cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
        ckpt_up_proj_name: str,
        num_experts: int) -> list[tuple[str, str, int, str]]:
    """Build the checkpoint-name to fused-parameter mapping for all experts.

    Returns one ``(param_name, weight_name, expert_id, shard_id)`` tuple
    per expert and per shard, ordered by expert and then w1, w2, w3. Gate
    and up projections map to the ``experts.w13_`` prefix, the down
    projection to ``experts.w2_``.
    """
    shard_sources = (
        ("w1", ckpt_gate_proj_name),
        ("w2", ckpt_down_proj_name),
        ("w3", ckpt_up_proj_name),
    )
    fused_w13_names = [ckpt_gate_proj_name, ckpt_up_proj_name]

    mapping = []
    for expert_id in range(num_experts):
        for shard_id, weight_name in shard_sources:
            if weight_name in fused_w13_names:
                param_name = "experts.w13_"
            else:
                param_name = "experts.w2_"
            mapping.append((param_name,
                            f"experts.{expert_id}.{weight_name}.",
                            expert_id, shard_id))
    return mapping

maybe_all_reduce_tensor_model_parallel ¶

maybe_all_reduce_tensor_model_parallel(
    final_hidden_states: Tensor,
)

The pplx combine kernel reduces across GPU ranks by default.

Source code in vllm/model_executor/layers/fused_moe/layer.py

def maybe_all_reduce_tensor_model_parallel(
        self, final_hidden_states: torch.Tensor):
    """
    All-reduce ``final_hidden_states`` across the tensor-parallel group
    unless pplx kernels are in use, in which case the input is returned
    untouched because the pplx combine kernel already reduces across GPU
    ranks by default.
    """
    if not self.use_pplx_kernels:
        return tensor_model_parallel_all_reduce(final_hidden_states)
    return final_hidden_states

must_reduce_shared_expert_outputs ¶

must_reduce_shared_expert_outputs() -> bool

The shared_experts are typically computed using the RowParallelLinear layer. The result of this function is typically used as the reduce_results argument to the module. When only tensor parallelism is used, it is not required to reduce the shared_experts results immediately; instead we reduce once at the end of the MoE op (refer to the DeepSeekV2MoE module). With EP and the pplx kernels this is no longer viable, as all GPU ranks in DP produce the complete set of hidden_states. Therefore it is required that we reduce the shared_experts output early.

Source code in vllm/model_executor/layers/fused_moe/layer.py

def must_reduce_shared_expert_outputs(self) -> bool:
    """
    Tell the caller whether shared-expert outputs must be reduced early.

    The shared_experts are typically computed using the RowParallelLinear
    layer, and the value returned here is typically passed as that
    module's reduce_results argument.
    With plain tensor parallelism the shared_experts results do not need
    an immediate reduction; a single reduction at the end of the MoE op
    is enough (refer to the DeepSeekV2MoE module).
    With EP and the pplx kernels that is no longer viable, because all
    GPU ranks in DP produce the complete set of hidden_states, so the
    shared_experts output has to be reduced early.
    """
    reduce_early = self.use_pplx_kernels
    return reduce_early

select_experts `staticmethod` ¶

select_experts(
    hidden_states: Tensor,
    router_logits: Tensor,
    top_k: int,
    use_grouped_topk: bool,
    renormalize: bool,
    topk_group: Optional[int] = None,
    num_expert_group: Optional[int] = None,
    custom_routing_function: Optional[Callable] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[Tensor] = None,
    indices_type: Optional[dtype] = None,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

@staticmethod
def select_experts(hidden_states: torch.Tensor,
                   router_logits: torch.Tensor,
                   top_k: int,
                   use_grouped_topk: bool,
                   renormalize: bool,
                   topk_group: Optional[int] = None,
                   num_expert_group: Optional[int] = None,
                   custom_routing_function: Optional[Callable] = None,
                   scoring_func: str = "softmax",
                   e_score_correction_bias: Optional[torch.Tensor] = None,
                   indices_type: Optional[torch.dtype] = None):
    """Choose the top-k experts and their weights for each token.

    Exactly one routing strategy runs, in this priority order:
    ``grouped_topk`` when ``use_grouped_topk`` is set, otherwise
    ``custom_routing_function`` when one is supplied, otherwise
    ``fused_topk``.

    Returns:
        ``(topk_weights, topk_ids)``. When ``indices_type`` is given, the
        ids are cast to it after grouped or custom routing; ``fused_topk``
        receives ``indices_type`` as an argument instead.
    """
    from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk

    # fused_topk is handed indices_type itself, so only the other two
    # strategies need the explicit cast afterwards.
    cast_ids_afterwards = True

    if use_grouped_topk:
        # DeepSeekV2 uses grouped_top_k
        assert topk_group is not None
        assert num_expert_group is not None
        topk_weights, topk_ids = grouped_topk(
            hidden_states=hidden_states,
            gating_output=router_logits,
            topk=top_k,
            renormalize=renormalize,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias)
    elif custom_routing_function is not None:
        topk_weights, topk_ids = custom_routing_function(
            hidden_states=hidden_states,
            gating_output=router_logits,
            topk=top_k,
            renormalize=renormalize)
    else:
        topk_weights, topk_ids, _ = fused_topk(
            hidden_states=hidden_states,
            gating_output=router_logits,
            topk=top_k,
            renormalize=renormalize,
            indices_type=indices_type,
        )
        cast_ids_afterwards = False

    if cast_ids_afterwards and indices_type is not None:
        topk_ids = topk_ids.to(dtype=indices_type)

    return topk_weights, topk_ids

weight_loader ¶

weight_loader(
    param: Parameter,
    loaded_weight: Tensor,
    weight_name: str,
    shard_id: str,
    expert_id: int,
) -> None

Source code in vllm/model_executor/layers/fused_moe/layer.py

def weight_loader(self, param: torch.nn.Parameter,
                  loaded_weight: torch.Tensor, weight_name: str,
                  shard_id: str, expert_id: int) -> None:
    """Load one checkpoint tensor for a single expert shard into ``param``.

    The global ``expert_id`` is first translated to a local id; the call
    returns without doing anything when the translation yields ``-1``.
    The loader then dispatches on substrings of ``weight_name`` and on
    attributes attached to ``param``. Every branch returns once it has
    handled the tensor, so the order of the checks below is significant.

    Args:
        param: Destination parameter. Optional attributes read from it:
            ``is_gguf_weight``, ``is_gguf_weight_type``, ``is_transposed``,
            ``quant_method`` and ``load_full_w2``.
        loaded_weight: Tensor read from the checkpoint.
        weight_name: Checkpoint name of the tensor; the substrings
            ``input_scale``, ``g_idx``, ``weight_scale_2``, ``scale``,
            ``zero``, ``offset``, ``weight_shape`` and ``weight`` select
            the loading path.
        shard_id: One of ``"w1"``, ``"w2"``, ``"w3"``.
        expert_id: Global expert index.

    Raises:
        ValueError: If ``shard_id`` is not one of the three valid ids, if
            a compressed-tensors input scale disagrees with the one
            already stored, or if a scale/zero/offset tensor's
            ``param.quant_method`` is not a supported scheme.
    """

    expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
    # -1 from the expert map means nothing is loaded for this expert
    # (presumably it is not hosted on this rank -- confirm against
    # determine_expert_map).
    if expert_id == -1:
        return
    quant_method_name = self.quant_method.__class__.__name__
    # compressed-tensors checkpoints with packed weights are stored flipped
    # TODO (mgoin): check self.quant_method.quant_config.quant_format
    # against known CompressionFormat enum values that have this quality
    if self.quant_method.__class__.__name__ in (
            "CompressedTensorsWNA16MarlinMoEMethod",
            "CompressedTensorsWNA16MoEMethod"):
        loaded_weight = loaded_weight.t().contiguous()

    if shard_id not in ("w1", "w2", "w3"):
        raise ValueError(f"shard_id must be ['w1','w2','w3'] but "
                         f"got {shard_id}.")

    # Only used in the error message of the scale/zero/offset branch.
    WEIGHT_SCALE_SUPPORTED = [
        e.value for e in FusedMoeWeightScaleSupported
    ]
    # Fetch the dim to shard the parameter/loaded weight
    # based on the shard id. This will be whatever
    # dimension intermediate_size_per_partition is used.
    SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}

    is_gguf_weight = getattr(param, "is_gguf_weight", False)
    is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
    # GGUF weight-type parameters carry a single value: record it on the
    # parameter and copy it in, with no sharding involved.
    if is_gguf_weight_type:
        param.weight_type = loaded_weight.item()
        param.data.copy_(loaded_weight)
        return

    # is_transposed: if the dim to shard the weight
    # should be flipped. Required by GPTQ, compressed-tensors
    # should be whatever dimension intermediate_size_per_partition is
    is_transposed = getattr(param, "is_transposed", False)
    shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
    if is_transposed:
        shard_dim = int(not shard_dim)

    # A 3-D checkpoint tensor is written into the whole parameter rather
    # than one expert's slice, so the shard dim moves one axis to the right
    # (presumably the leading axis is the expert axis -- TODO confirm).
    full_load = len(loaded_weight.shape) == 3
    if full_load:
        shard_dim += 1

    # Materialize GGUF UninitializedParameter
    if is_gguf_weight and isinstance(param, UninitializedParameter):
        final_shape = list(loaded_weight.shape)
        # w1 and w3 share one fused parameter, hence the doubled size.
        if shard_id in ["w1", "w3"]:
            final_shape[1] *= 2
        final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size
        param.materialize(final_shape, dtype=loaded_weight.dtype)

    expert_data = param.data if full_load else param.data[expert_id]
    # Case input scale: input_scale loading is only supported for fp8
    if "input_scale" in weight_name:
        # this is needed for compressed-tensors only
        loaded_weight = loaded_weight.to(param.data.device)

        # For compressed-tensors, a stored value other than 1 that differs
        # from the incoming scale by more than 1e-5 is rejected.
        # NOTE(review): 1 looks like the "not yet loaded" initial value --
        # confirm against the quant method's create_weights.
        if ("compressed" in quant_method_name.lower()
                and param.data[expert_id] != 1
                and (param.data[expert_id] - loaded_weight).abs() > 1e-5):
            raise ValueError(
                "input_scales of w1 and w3 of a layer "
                f"must be equal. But got {param.data[expert_id]} "
                f"vs. {loaded_weight}")

        self._load_single_value(param=param,
                                loaded_weight=loaded_weight,
                                expert_id=expert_id)
        return

    # Case g_idx
    if "g_idx" in weight_name:
        # g_idx always uses shard_dim 0, regardless of the value computed
        # above.
        self._load_g_idx(shard_dim=0,
                         shard_id=shard_id,
                         loaded_weight=loaded_weight,
                         expert_data=expert_data,
                         tp_rank=self.tp_rank)
        return

    if "ModelOpt" in quant_method_name:
        # NOTE(review): names containing 'input_scale' already returned in
        # the input-scale branch above, so only 'weight_scale_2' can reach
        # the first arm here.
        if ('weight_scale_2' in weight_name
                or 'input_scale' in weight_name):
            self._load_per_tensor_weight_scale(shard_id=shard_id,
                                               param=param,
                                               loaded_weight=loaded_weight,
                                               expert_id=expert_id)
        elif "weight" in weight_name:
            self._load_model_weight_or_group_weight_scale(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.tp_rank)
        return

    # Case weight scales, zero_points and offset
    if ("scale" in weight_name or "zero" in weight_name
            or "offset" in weight_name):
        # load the weight scales and zp based on the quantization scheme
        # supported weight scales/zp can be found in
        # FusedMoeWeightScaleSupported
        # TODO @dsikka: once hardened, refactor to use vLLM Parameters
        # specific to each case
        quant_method = getattr(param, "quant_method", None)
        if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
            self._load_per_channel_weight_scale(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.tp_rank)
        elif quant_method in [
                FusedMoeWeightScaleSupported.GROUP.value,
                FusedMoeWeightScaleSupported.BLOCK.value,
        ]:
            self._load_model_weight_or_group_weight_scale(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.tp_rank,
                load_full_w2=getattr(param, "load_full_w2", False))
        elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
            self._load_per_tensor_weight_scale(shard_id=shard_id,
                                               param=param,
                                               loaded_weight=loaded_weight,
                                               expert_id=expert_id)
        else:
            raise ValueError(
                f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}")
        return

    # Case weight_shape
    if "weight_shape" in weight_name:
        # only required by compressed-tensors
        self._load_single_value(param=param,
                                loaded_weight=loaded_weight,
                                expert_id=expert_id)
        return

    # Case model weights
    if "weight" in weight_name:
        self._load_model_weight_or_group_weight_scale(
            shard_id=shard_id,
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=self.tp_rank)
        return

FusedMoEMethodBase ¶

Bases: QuantizeMethodBase

Source code in vllm/model_executor/layers/fused_moe/layer.py

class FusedMoEMethodBase(QuantizeMethodBase):
    """Abstract base for the methods that create and apply fused MoE weights.

    Subclasses must implement ``create_weights`` and ``apply``. Subclasses
    that rely on ``init_prepare_finalize`` must also override
    ``select_gemm_impl``, whose default raises ``NotImplementedError``.
    """

    @abstractmethod
    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
                       params_dtype: torch.dtype, **extra_weight_attrs):
        """Create the expert weights on ``layer``; subclasses implement it."""
        raise NotImplementedError

    def init_prepare_finalize(self, moe: MoEConfig,
                              quant_config: Optional[QuantizationConfig]):
        """Set up ``self.fused_experts`` when pplx kernels are enabled.

        Reads the all2all manager from the EP group's device communicator
        (it must not be ``None``). When ``moe.use_pplx_kernels`` is true, a
        handle is requested from the manager and wrapped in a
        ``PplxPrepareAndFinalize``; ``select_gemm_impl`` then supplies the
        experts implementation and both are combined into a
        ``FusedMoEModularKernel`` stored on ``self.fused_experts``.
        Otherwise nothing is assigned.

        ``quant_config`` is not read in this method body.
        """
        all2all_manager = get_ep_group().device_communicator.all2all_manager
        assert all2all_manager is not None

        prepare_finalize = None
        if moe.use_pplx_kernels:
            all_to_all_args = dict(
                max_num_tokens=moe.max_num_tokens,
                num_experts=moe.num_experts,
                experts_per_token=moe.experts_per_token,  # topk
                rank=all2all_manager.rank,
                world_size=all2all_manager.world_size,
                # dp_size actually means tp_size, bug in pplx kernels
                dp_size=all2all_manager.tp_group.world_size,
                hidden_dim=moe.hidden_dim,
                hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
                # For blocked per token: set to
                #   ceil_div(hidden_dim, block_size) * sizeof(float32)
                # For per-token: set to sizeof(float32)
                # Here: 0 unless the input dtype is one byte wide, in which
                # case the blocked-per-token formula above is used.
                hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
                    (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
                    torch.float32.itemsize)),
                group_name=all2all_manager.cpu_group.group_name,
            )

            handle = all2all_manager.get_handle(all_to_all_args)

            prepare_finalize = PplxPrepareAndFinalize(
                handle,
                max_num_tokens=moe.max_num_tokens,
                world_size=all2all_manager.world_size,
                rank=all2all_manager.rank,
                # dp_size actually means tp_size, bug in pplx kernels
                dp_size=all2all_manager.tp_group.world_size,
                quant_dtype=moe.in_dtype,
            )

        # Only build the modular kernel when a prepare/finalize object was
        # actually created above.
        if prepare_finalize is not None:
            experts = self.select_gemm_impl(prepare_finalize)
            self.fused_experts = FusedMoEModularKernel(
                prepare_finalize,
                experts,
            )

    def select_gemm_impl(
        self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
    ) -> FusedMoEPermuteExpertsUnpermute:
        """Return the experts implementation matching ``prepare_finalize``.

        The base implementation always raises ``NotImplementedError``.
        """
        # based on the all2all implementation, select the appropriate
        # gemm implementation
        raise NotImplementedError(
            "Subclass must select appropriate gemm implementation"
            " based on the prepare_finalize")

    @abstractmethod
    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
    ) -> torch.Tensor:
        """Run the MoE computation for ``x``; subclasses implement it."""
        raise NotImplementedError

apply `abstractmethod` ¶

apply(
    layer: Module,
    x: Tensor,
    router_logits: Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: Optional[int] = None,
    num_expert_group: Optional[int] = None,
    global_num_experts: int = -1,
    expert_map: Optional[Tensor] = None,
    custom_routing_function: Optional[Callable] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[Tensor] = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
) -> Tensor

Source code in vllm/model_executor/layers/fused_moe/layer.py

@abstractmethod
def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool,
    use_grouped_topk: bool = False,
    topk_group: Optional[int] = None,
    num_expert_group: Optional[int] = None,
    global_num_experts: int = -1,
    expert_map: Optional[torch.Tensor] = None,
    custom_routing_function: Optional[Callable] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[torch.Tensor] = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
) -> torch.Tensor:
    """Abstract hook for running the MoE computation on ``layer``.

    The base body only raises; concrete subclasses must override it.

    NOTE(review): the parameter meanings below are inferred from names and
    annotations only -- confirm against a concrete implementation.

    Args:
        layer: Module the method operates on.
        x: Input tensor.
        router_logits: Router output, presumably used to pick experts.
        top_k: Presumably the number of experts selected per token.
        renormalize: Presumably whether routing weights are renormalized.
        use_grouped_topk, topk_group, num_expert_group: Grouped top-k
            routing options.
        global_num_experts: Defaults to -1.
        expert_map: Optional expert mapping tensor.
        custom_routing_function: Optional routing callable.
        scoring_func: Defaults to "softmax".
        e_score_correction_bias: Optional bias tensor.
        apply_router_weight_on_input: Defaults to False.
        activation: Activation name; defaults to "silu".

    Returns:
        A ``torch.Tensor`` (per the annotation).

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError

create_weights `abstractmethod` ¶

create_weights(
    layer: Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: dtype,
    **extra_weight_attrs,
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

@abstractmethod
def create_weights(self, layer: torch.nn.Module, num_experts: int,
                   hidden_size: int, intermediate_size_per_partition: int,
                   params_dtype: torch.dtype, **extra_weight_attrs):
    """Abstract hook for creating the expert weights of ``layer``.

    The base body only raises; concrete subclasses must override it.

    NOTE(review): parameter meanings are inferred from names only --
    confirm against a concrete implementation.

    Args:
        layer: Module the weights are created for.
        num_experts: Number of experts.
        hidden_size: Hidden size.
        intermediate_size_per_partition: Intermediate size per partition.
        params_dtype: Dtype for the created parameters.
        **extra_weight_attrs: Additional keyword attributes.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError

init_prepare_finalize ¶

init_prepare_finalize(
    moe: MoEConfig,
    quant_config: Optional[QuantizationConfig],
)

Source code in vllm/model_executor/layers/fused_moe/layer.py

def init_prepare_finalize(self, moe: MoEConfig,
                          quant_config: Optional[QuantizationConfig]):
    """Set up ``self.fused_experts`` when pplx kernels are enabled.

    Looks up the all2all manager of the EP group (it must exist). When
    ``moe.use_pplx_kernels`` is set, an all-to-all handle is requested from
    the manager, wrapped in a ``PplxPrepareAndFinalize``, paired with the
    experts implementation returned by ``self.select_gemm_impl`` and stored
    as a ``FusedMoEModularKernel`` on ``self.fused_experts``. Otherwise
    nothing is installed.

    Args:
        moe: MoE configuration read for sizes, dtype and the pplx switch.
        quant_config: Not used by this implementation.
    """
    all2all_manager = get_ep_group().device_communicator.all2all_manager
    assert all2all_manager is not None

    # Only the pplx path produces a prepare/finalize object; without it
    # there is nothing to install.
    if not moe.use_pplx_kernels:
        return

    # dp_size actually means tp_size, bug in pplx kernels
    pplx_dp_size = all2all_manager.tp_group.world_size

    # For blocked per token: set to
    #   ceil_div(hidden_dim, block_size) * sizeof(float32)
    # For per-token: set to sizeof(float32)
    # Scales are only transferred for 1-byte input dtypes.
    if moe.in_dtype.itemsize != 1:
        scale_bytes = 0
    else:
        num_blocks = (moe.hidden_dim + moe.block_size - 1) // moe.block_size
        scale_bytes = num_blocks * torch.float32.itemsize

    handle = all2all_manager.get_handle({
        "max_num_tokens": moe.max_num_tokens,
        "num_experts": moe.num_experts,
        "experts_per_token": moe.experts_per_token,  # topk
        "rank": all2all_manager.rank,
        "world_size": all2all_manager.world_size,
        "dp_size": pplx_dp_size,
        "hidden_dim": moe.hidden_dim,
        "hidden_dim_bytes": moe.hidden_dim * moe.in_dtype.itemsize,
        "hidden_dim_scale_bytes": scale_bytes,
        "group_name": all2all_manager.cpu_group.group_name,
    })

    prepare_finalize = PplxPrepareAndFinalize(
        handle,
        max_num_tokens=moe.max_num_tokens,
        world_size=all2all_manager.world_size,
        rank=all2all_manager.rank,
        dp_size=pplx_dp_size,
        quant_dtype=moe.in_dtype,
    )

    self.fused_experts = FusedMoEModularKernel(
        prepare_finalize,
        self.select_gemm_impl(prepare_finalize),
    )

select_gemm_impl ¶

select_gemm_impl(
    prepare_finalize: Optional[FusedMoEPrepareAndFinalize],
) -> FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/fused_moe/layer.py

def select_gemm_impl(
    self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
) -> FusedMoEPermuteExpertsUnpermute:
    """Choose the expert GEMM implementation for ``prepare_finalize``.

    The choice is meant to depend on the all2all implementation in use.
    This base version offers no default and always raises, so a subclass
    has to override it.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    message = ("Subclass must select appropriate gemm implementation"
               " based on the prepare_finalize")
    raise NotImplementedError(message)

FusedMoeWeightScaleSupported ¶

Bases: Enum

Source code in vllm/model_executor/layers/fused_moe/layer.py

class FusedMoeWeightScaleSupported(Enum):
    """String-valued tags naming kinds of weight scale.

    NOTE(review): judging by the class name, these are presumably the
    weight-scale granularities the fused MoE layer accepts -- confirm
    against the code that consumes them.
    """
    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"
    BLOCK = "block"

BLOCK `class-attribute` `instance-attribute` ¶

BLOCK = 'block'

CHANNEL `class-attribute` `instance-attribute` ¶

CHANNEL = 'channel'

GROUP `class-attribute` `instance-attribute` ¶

GROUP = 'group'

TENSOR `class-attribute` `instance-attribute` ¶

TENSOR = 'tensor'

TritonExperts ¶

Bases: FusedMoEPermuteExpertsUnpermute

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
    """Experts implementation built on ``invoke_fused_moe_kernel``.

    ``apply`` launches the kernel against ``w1``, applies the activation,
    re-quantizes the intermediate result and launches the kernel again
    against ``w2``. The ``use_*`` flags given at construction select the
    quantization scheme forwarded to every launch.
    """

    def __init__(
        self,
        use_fp8_w8a8: bool,
        use_int8_w8a8: bool,
        use_int8_w8a16: bool,
        use_int4_w4a16: bool,
        per_channel_quant: bool,
        block_shape: Optional[list[int]] = None,
        block_m: Optional[int] = None,
    ):
        """Store the quantization settings later consumed by ``apply``."""
        super().__init__()
        # Quantization-scheme switches, forwarded verbatim to the kernels.
        self.use_fp8_w8a8 = use_fp8_w8a8
        self.use_int8_w8a8 = use_int8_w8a8
        self.use_int8_w8a16 = use_int8_w8a16
        self.use_int4_w4a16 = use_int4_w4a16
        self.per_channel_quant = per_channel_quant
        self.block_shape = block_shape
        # Stored only; nothing in this class reads it back.
        self.block_m = block_m
        # Passed to moe_kernel_quantize_input for the intermediate result.
        self.qtype = get_config_qtype(
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
        )

    def workspace_shapes(
        self,
        a: torch.Tensor,
        M: int,
        N: int,
        K: int,
        topk: int,
        num_experts: int,
    ) -> tuple[int, int, torch.dtype]:
        """Return element counts for the two workspaces and their dtype.

        Both counts are multiplied by ``num_experts`` when ``a`` is 3-D.

        Returns:
            ``(workspace1, workspace2, a.dtype)``.
        """
        scale = num_experts if a.dim() == 3 else 1
        rows = M * topk * scale
        return (rows * max(2 * N, K), rows * N, a.dtype)

    def apply(
        self,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: Optional[torch.Tensor],
        w1_scale: Optional[torch.Tensor],
        w2_scale: Optional[torch.Tensor],
        w1_zp: Optional[torch.Tensor],
        w2_zp: Optional[torch.Tensor],
        a1q_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_num_tokens: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Run both expert kernel launches and return the second output.

        ``hidden_states`` must be a contiguous 2-D tensor of dtype
        float32, float16, bfloat16 or float8_e4m3fn, and the last
        dimension of ``w1`` and ``w2`` must have stride 1.
        ``workspace13`` backs both the first and the last intermediate
        buffer; ``workspace2`` backs the activation output.
        ``expert_num_tokens`` is not used by this implementation.

        Returns:
            A view of ``workspace13`` shaped ``(num_tokens, top_k, K)``.

        Raises:
            AssertionError: when an input constraint is violated.
            ValueError: when no compute type exists for the input dtype.
        """
        # Check constraints.
        hidden_dim = hidden_states.size(-1)
        if self.use_int4_w4a16:
            assert hidden_dim // 2 == w1.size(2), "Hidden size mismatch"
        else:
            assert hidden_dim == w1.size(2), (
                f"Hidden size mismatch {hidden_dim} != {w1.size(2)}")

        assert hidden_states.is_contiguous(), (
            "Hidden_states must be contiguous")
        assert hidden_states.dim() == 2
        for weight in (w1, w2):
            assert weight.stride(-1) == 1, (
                "Stride of last dimension must be 1")
        supported_dtypes = (torch.float32, torch.float16, torch.bfloat16,
                            torch.float8_e4m3fn)
        assert hidden_states.dtype in supported_dtypes

        expert_count, token_count, n_size, k_size, topk = (
            mk._moe_problem_size(hidden_states, w1, w2, topk_ids))

        # -1 means "not specified": use the count from the problem size.
        if global_num_experts == -1:
            global_num_experts = expert_count

        config = try_get_optimal_moe_config(
            w1.shape,
            w2.shape,
            topk,
            get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8,
                                 use_int8_w8a16=self.use_int8_w8a16,
                                 use_int4_w4a16=self.use_int4_w4a16,
                                 dtype=hidden_states.dtype),
            token_count,
            block_shape=self.block_shape,
        )

        # Triton compute type per activation dtype; fp8 inputs are mapped
        # to bfloat16.
        compute_types = {
            torch.bfloat16: tl.bfloat16,
            torch.float16: tl.float16,
            torch.float32: tl.float32,
            torch.float8_e4m3fn: tl.bfloat16,
        }
        if hidden_states.dtype not in compute_types:
            raise ValueError(
                f"Unsupported compute_type: {hidden_states.dtype}")
        compute_type = compute_types[hidden_states.dtype]

        # The first and last buffers may share workspace13: by the time
        # the last one is written, the first is no longer needed.
        first_out = _resize_cache(workspace13, (token_count, topk, n_size))
        act_out = _resize_cache(workspace2,
                                (token_count * topk, n_size // 2))
        second_out = _resize_cache(workspace13, (token_count, topk, k_size))

        routing = moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'],
                                       global_num_experts, expert_map)
        sorted_token_ids, expert_ids, num_tokens_post_padded = routing

        # Keyword arguments common to both kernel launches.
        kernel_kwargs = dict(
            compute_type=compute_type,
            use_fp8_w8a8=self.use_fp8_w8a8,
            use_int8_w8a8=self.use_int8_w8a8,
            use_int8_w8a16=self.use_int8_w8a16,
            use_int4_w4a16=self.use_int4_w4a16,
            per_channel_quant=self.per_channel_quant,
            block_shape=self.block_shape,
        )

        invoke_fused_moe_kernel(hidden_states, w1, first_out, a1q_scale,
                                w1_scale, w1_zp, None, sorted_token_ids,
                                expert_ids, num_tokens_post_padded, False,
                                topk, config, **kernel_kwargs)

        self.activation(activation, act_out, first_out.view(-1, n_size))

        quantized_act, a2q_scale = moe_kernel_quantize_input(
            act_out, a2_scale, self.qtype, self.per_channel_quant,
            self.block_shape)

        invoke_fused_moe_kernel(quantized_act, w2, second_out, a2q_scale,
                                w2_scale, w2_zp, None, sorted_token_ids,
                                expert_ids, num_tokens_post_padded, False,
                                1, config, **kernel_kwargs)

        return second_out

block_m `instance-attribute` ¶

block_m = block_m

block_shape `instance-attribute` ¶

block_shape = block_shape

per_channel_quant `instance-attribute` ¶

per_channel_quant = per_channel_quant

qtype `instance-attribute` ¶

qtype = get_config_qtype(
    use_fp8_w8a8=use_fp8_w8a8,
    use_int8_w8a8=use_int8_w8a8,
    use_int8_w8a16=use_int8_w8a16,
    use_int4_w4a16=use_int4_w4a16,
)

use_fp8_w8a8 `instance-attribute` ¶

use_fp8_w8a8 = use_fp8_w8a8

use_int4_w4a16 `instance-attribute` ¶

use_int4_w4a16 = use_int4_w4a16

use_int8_w8a16 `instance-attribute` ¶

use_int8_w8a16 = use_int8_w8a16

use_int8_w8a8 `instance-attribute` ¶

use_int8_w8a8 = use_int8_w8a8

init ¶

__init__(
    use_fp8_w8a8: bool,
    use_int8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
    per_channel_quant: bool,
    block_shape: Optional[list[int]] = None,
    block_m: Optional[int] = None,
)

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def __init__(
    self,
    use_fp8_w8a8: bool,
    use_int8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
    per_channel_quant: bool,
    block_shape: Optional[list[int]] = None,
    block_m: Optional[int] = None,
):
    """Store the quantization settings on the instance.

    Every argument is kept under an attribute of the same name, and
    ``self.qtype`` is derived from the four ``use_*`` flags via
    ``get_config_qtype``.
    """
    super().__init__()
    # Quantization-scheme switches.
    self.use_fp8_w8a8 = use_fp8_w8a8
    self.use_int8_w8a8 = use_int8_w8a8
    self.use_int8_w8a16 = use_int8_w8a16
    self.use_int4_w4a16 = use_int4_w4a16
    self.per_channel_quant = per_channel_quant
    self.block_shape = block_shape
    self.block_m = block_m
    self.qtype = get_config_qtype(
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
    )

apply ¶

apply(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_ids: Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: Optional[Tensor],
    w1_scale: Optional[Tensor],
    w2_scale: Optional[Tensor],
    w1_zp: Optional[Tensor],
    w2_zp: Optional[Tensor],
    a1q_scale: Optional[Tensor],
    a2_scale: Optional[Tensor],
    workspace13: Tensor,
    workspace2: Tensor,
    expert_num_tokens: Optional[Tensor],
) -> Tensor

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def apply(
    self,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: Optional[torch.Tensor],
    w1_scale: Optional[torch.Tensor],
    w2_scale: Optional[torch.Tensor],
    w1_zp: Optional[torch.Tensor],
    w2_zp: Optional[torch.Tensor],
    a1q_scale: Optional[torch.Tensor],
    a2_scale: Optional[torch.Tensor],
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_num_tokens: Optional[torch.Tensor],
) -> torch.Tensor:
    """Run both expert kernel launches and return the second output.

    ``hidden_states`` must be a contiguous 2-D tensor of dtype float32,
    float16, bfloat16 or float8_e4m3fn, and the last dimension of ``w1``
    and ``w2`` must have stride 1. ``workspace13`` backs both the first
    and the last intermediate buffer; ``workspace2`` backs the activation
    output. ``expert_num_tokens`` is not used by this implementation.

    Returns:
        A view of ``workspace13`` shaped ``(num_tokens, top_k, K)``.

    Raises:
        AssertionError: when an input constraint is violated.
        ValueError: when no compute type exists for the input dtype.
    """
    # Check constraints.
    hidden_dim = hidden_states.size(-1)
    if self.use_int4_w4a16:
        assert hidden_dim // 2 == w1.size(2), "Hidden size mismatch"
    else:
        assert hidden_dim == w1.size(2), (
            f"Hidden size mismatch {hidden_dim} != {w1.size(2)}")

    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert hidden_states.dim() == 2
    for weight in (w1, w2):
        assert weight.stride(-1) == 1, "Stride of last dimension must be 1"
    supported_dtypes = (torch.float32, torch.float16, torch.bfloat16,
                        torch.float8_e4m3fn)
    assert hidden_states.dtype in supported_dtypes

    expert_count, token_count, n_size, k_size, topk = (
        mk._moe_problem_size(hidden_states, w1, w2, topk_ids))

    # -1 means "not specified": use the count from the problem size.
    if global_num_experts == -1:
        global_num_experts = expert_count

    config = try_get_optimal_moe_config(
        w1.shape,
        w2.shape,
        topk,
        get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8,
                             use_int8_w8a16=self.use_int8_w8a16,
                             use_int4_w4a16=self.use_int4_w4a16,
                             dtype=hidden_states.dtype),
        token_count,
        block_shape=self.block_shape,
    )

    # Triton compute type per activation dtype; fp8 inputs are mapped to
    # bfloat16.
    compute_types = {
        torch.bfloat16: tl.bfloat16,
        torch.float16: tl.float16,
        torch.float32: tl.float32,
        torch.float8_e4m3fn: tl.bfloat16,
    }
    if hidden_states.dtype not in compute_types:
        raise ValueError(
            f"Unsupported compute_type: {hidden_states.dtype}")
    compute_type = compute_types[hidden_states.dtype]

    # The first and last buffers may share workspace13: by the time the
    # last one is written, the first is no longer needed.
    first_out = _resize_cache(workspace13, (token_count, topk, n_size))
    act_out = _resize_cache(workspace2, (token_count * topk, n_size // 2))
    second_out = _resize_cache(workspace13, (token_count, topk, k_size))

    routing = moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'],
                                   global_num_experts, expert_map)
    sorted_token_ids, expert_ids, num_tokens_post_padded = routing

    # Keyword arguments common to both kernel launches.
    kernel_kwargs = dict(
        compute_type=compute_type,
        use_fp8_w8a8=self.use_fp8_w8a8,
        use_int8_w8a8=self.use_int8_w8a8,
        use_int8_w8a16=self.use_int8_w8a16,
        use_int4_w4a16=self.use_int4_w4a16,
        per_channel_quant=self.per_channel_quant,
        block_shape=self.block_shape,
    )

    invoke_fused_moe_kernel(hidden_states, w1, first_out, a1q_scale,
                            w1_scale, w1_zp, None, sorted_token_ids,
                            expert_ids, num_tokens_post_padded, False, topk,
                            config, **kernel_kwargs)

    self.activation(activation, act_out, first_out.view(-1, n_size))

    quantized_act, a2q_scale = moe_kernel_quantize_input(
        act_out, a2_scale, self.qtype, self.per_channel_quant,
        self.block_shape)

    invoke_fused_moe_kernel(quantized_act, w2, second_out, a2q_scale,
                            w2_scale, w2_zp, None, sorted_token_ids,
                            expert_ids, num_tokens_post_padded, False, 1,
                            config, **kernel_kwargs)

    return second_out

workspace_shapes ¶

workspace_shapes(
    a: Tensor,
    M: int,
    N: int,
    K: int,
    topk: int,
    num_experts: int,
) -> tuple[int, int, dtype]

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def workspace_shapes(
    self,
    a: torch.Tensor,
    M: int,
    N: int,
    K: int,
    topk: int,
    num_experts: int,
) -> tuple[int, int, torch.dtype]:
    """Return element counts for the two workspaces and their dtype.

    Both counts are multiplied by ``num_experts`` when ``a`` is 3-D.

    Returns:
        ``(workspace1, workspace2, a.dtype)`` where ``workspace1`` is
        ``M * topk * max(2 * N, K)`` and ``workspace2`` is
        ``M * topk * N``, each times the 3-D factor.
    """
    scale = num_experts if a.dim() == 3 else 1
    rows = M * topk * scale
    return (rows * max(2 * N, K), rows * N, a.dtype)

cutlass_moe_fp4 ¶

cutlass_moe_fp4(
    a: Tensor,
    a1_gscale: Tensor,
    w1_fp4: Tensor,
    w1_blockscale: Tensor,
    w1_alphas: Tensor,
    a2_gscale: Tensor,
    w2_fp4: Tensor,
    w2_blockscale: Tensor,
    w2_alphas: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    m: int,
    n: int,
    k: int,
    e: int,
    device: device,
)

MoE implementation for FP4 Inputs

Gemm 1¶

a: Input tensor: [m, k] (half/bfloat16) a1_gscale: Activation scale per expert: [e] (float32) w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k] w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1) (Note: n is the up projection output dim, k is the input dim in full precision) w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3) (Block size = 16 for NVFP4)

Gemm 2¶

a2_gscale: Activation scale per expert: [e] w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n] w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1) w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3

topk_weights: [m, topk] dtype: float8 topk_ids: [m, topk] dtype: float8

m, n, k: Unquantized weight shapes, dtype: int e: number of experts, dtype: int

assumes that topk < k < n to satisfy - up/down projection expectations.

Source code in vllm/model_executor/layers/fused_moe/cutlass_moe.py

def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
                    w1_fp4: torch.Tensor, w1_blockscale: torch.Tensor,
                    w1_alphas: torch.Tensor, a2_gscale: torch.Tensor,
                    w2_fp4: torch.Tensor, w2_blockscale: torch.Tensor,
                    w2_alphas: torch.Tensor, topk_weights: torch.Tensor,
                    topk_ids: torch.Tensor, m: int, n: int, k: int, e: int,
                    device: torch.device):
    """
    MoE implementation for FP4 Inputs

    # Gemm 1
    a: Input tensor: [m, k] (half/bfloat16)
    a1_gscale: Activation scale per expert: [e]  (float32)
    w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
    w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
    (Note: `n` is the up projection output dim, `k` is the input dim in
     full precision)
    w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
                   (Block size = 16 for NVFP4)

    # Gemm 2
    a2_gscale: Activation scale per expert: [e]
    w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
    w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
    w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3

    topk_weights: [m, topk] dtype: float8
    topk_ids: [m, topk] dtype: float8
    (NOTE(review): the dtypes listed for topk_weights / topk_ids are not
     checked by this function and look suspicious for an index tensor --
     confirm.)

    m, n, k: Unquantized weight shapes, dtype: int
    e: number of experts, dtype: int

    assumes that topk < k < n to satisfy - up/down projection expectations.

    Returns the [m, k] output tensor in the dtype of `a`.

    Raises AssertionError when any of the shape/dtype checks below fails.
    """
    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
    assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
    assert (w1_fp4.ndim == 3 and w2_fp4.ndim == 3 and w1_blockscale.ndim == 3
            and w2_blockscale.ndim
            == 3), ("All Weights must be of rank 3 for cutlass_moe_fp4")
    m_a, k_a = a.shape
    e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
    e_w2, k_w2, half_n_w2 = w2_fp4.shape

    # The message must be one string: a comma between the two literals
    # would make the AssertionError carry a tuple instead.
    assert (e_w1 == e_w2 and e_w1 == e), ("Number of experts must match"
                                          " between weights.")
    assert (k_a // 2 == half_k_w1
            and k == k_w2), ("Hidden size mismatch between a, w1 and w2")
    assert (nx2_w1 == n * 2 and half_n_w2 == n // 2), ("mismatch in "
                                                       "expected `n`")
    assert (m == m_a), "input shape mismatch"
    assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
    assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
    assert (topk_weights.shape[0] == m and topk_ids.shape[0]
            == m), ("topk must be provided for each row of a")

    out_dtype = a.dtype
    num_topk = topk_ids.shape[1]

    expert_offsets = torch.empty((e + 1), dtype=torch.int32, device=device)
    # Problem size:  (num_experts, (m,2n,k))
    problem_sizes1 = torch.empty((e, 3), dtype=torch.int32, device=device)
    # Problem size:  (num_experts, (m,n,k))
    problem_sizes2 = torch.empty((e, 3), dtype=torch.int32, device=device)

    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)

    # problem shapes should have [m, n, k]
    # Note that problem sizes are based on logical number of elements.
    ops.get_cutlass_moe_mm_data(topk_ids, expert_offsets, problem_sizes1,
                                problem_sizes2, a_map, c_map, e, n, k)

    # Block-scale offsets use per-expert token counts rounded up to a
    # multiple of 128.
    tokens_per_expert = problem_sizes1[:, 0]
    rounded_tokens_per_expert = (tokens_per_expert + (128 - 1)) // 128 * 128
    blockscale_offsets = torch.zeros(e + 1, dtype=torch.int32, device=device)
    blockscale_offsets[1:] = torch.cumsum(rounded_tokens_per_expert, dim=0)

    rep_a_fp4, rep_a_blockscale = ops.scaled_fp4_experts_quant(
        a,
        a1_gscale,
        expert_offsets,
        blockscale_offsets,
        num_topk,
        expert_map=a_map)

    c1 = ops.cutlass_fp4_moe_mm(rep_a_fp4, w1_fp4, rep_a_blockscale,
                                w1_blockscale, w1_alphas, problem_sizes1,
                                expert_offsets[:-1], blockscale_offsets[:-1],
                                out_dtype, device)
    del rep_a_fp4, rep_a_blockscale
    # The activation output is half as wide as the first GEMM output:
    # w1_fp4.shape[1] is 2 * n, so the intermediate tensor is n wide.
    intermediate = torch.empty((m * num_topk, w1_fp4.shape[1] // 2),
                               device=device,
                               dtype=out_dtype)

    torch.ops._C.silu_and_mul(intermediate, c1)

    int_fp4, int_blockscale = ops.scaled_fp4_experts_quant(
        intermediate, a2_gscale, expert_offsets, blockscale_offsets, num_topk)

    c2 = ops.cutlass_fp4_moe_mm(int_fp4, w2_fp4, int_blockscale, w2_blockscale,
                                w2_alphas, problem_sizes2, expert_offsets[:-1],
                                blockscale_offsets[:-1], out_dtype, device)
    del int_fp4, int_blockscale
    # Reorder rows through c_map, weight each row by its top-k weight and
    # sum over the top-k axis.
    out = (c2[c_map].view(m, num_topk, k) *
           topk_weights.view(m, num_topk, 1).half()).sum(dim=1)
    return out.to(dtype=out_dtype)

cutlass_moe_fp8 ¶

cutlass_moe_fp8(
    a: Tensor,
    w1_q: Tensor,
    w2_q: Tensor,
    w1_scale: Tensor,
    w2_scale: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    ab_strides1: Tensor,
    c_strides1: Tensor,
    ab_strides2: Tensor,
    c_strides2: Tensor,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    out_dtype: dtype = half,
    expert_map: Optional[Tensor] = None,
    apply_router_weight_on_input: bool = False,
) -> Tensor

This function computes an a8w8-quantized Mixture of Experts (MoE) layer using two sets of quantized weights, w1_q and w2_q, and top-k gating mechanism. The matrix multiplications are implemented with CUTLASS grouped gemm.

a (torch.Tensor): The input tensor to the MoE layer. Shape: [M, K]
w1_q (torch.Tensor): The first set of fp8-quantized expert weights. Shape: [num_experts, K, 2N] (the weights are passed transposed)
w2_q (torch.Tensor): The second set of fp8-quantized expert weights. Shape: [num_experts, N, K] (the weights are passed transposed)
w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q. Shape: [num_experts] or [num_experts, 2N]
w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. Shape: [num_experts] or [num_experts, K]
gating_output (torch.Tensor): The output of the gating operation (before softmax).
topk_weights (torch.Tensor): The weights of each token->expert mapping.
ab_strides1 (torch.Tensor): The input and weights strides of the first grouped gemm.
c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
ab_strides2 (torch.Tensor): The input and weights strides of the second grouped gemm.
c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. Shape: scalar or [M]
a2_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize the intermediate result between the gemms. Shape: scalar or [M]
out_dtype (torch.dtype): The output tensor type.
expert_map (Optional[torch.Tensor]): In the case of Expert parallel, every Rank is responsible for a subset of experts. expert_map is a mapping from global expert-id to local expert-id. When expert_map[i] is -1, it means that this Rank is not responsible for global expert-id i.
apply_router_weight_on_input (bool): When true, the topk weights are applied directly on the inputs. This is only applicable when topk is 1.

Returns: - torch.Tensor: The output tensor after applying the MoE layer (its type is controlled by out_dtype, which defaults to fp16).

Source code in vllm/model_executor/layers/fused_moe/cutlass_moe.py

def cutlass_moe_fp8(
    a: torch.Tensor,
    w1_q: torch.Tensor,
    w2_q: torch.Tensor,
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    ab_strides1: torch.Tensor,
    c_strides1: torch.Tensor,
    ab_strides2: torch.Tensor,
    c_strides2: torch.Tensor,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    out_dtype: torch.dtype = torch.half,
    expert_map: Optional[torch.Tensor] = None,
    apply_router_weight_on_input: bool = False,
) -> torch.Tensor:
    """
    This function computes an a8w8-quantized Mixture of Experts (MoE) layer
    using two sets of quantized weights, w1_q and w2_q, and top-k gating
    mechanism. The matrix multiplications are implemented with CUTLASS
    grouped gemm.

    Parameters:
    - a (torch.Tensor): The input tensor to the MoE layer.
        Shape: [M, K]
    - w1_q (torch.Tensor): The first set of fp8-quantized expert weights.
        Shape: [num_experts, K, 2N] (the weights are passed transposed)
    - w2_q (torch.Tensor): The second set of fp8-quantized expert weights.
        Shape: [num_experts, N, K] (the weights are passed transposed)
    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
        Shape: [num_experts] or [num_experts, 2N]
    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
        Shape: [num_experts] or [num_experts, K]
    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
    - topk_ids (torch.Tensor): The expert ids paired with topk_weights
        (presumably one row of selected experts per token -- confirm).
    - ab_strides1 (torch.Tensor): The input and weights strides of the first
        grouped gemm.
    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
    - ab_strides2 (torch.Tensor): The input and weights strides of the second
        grouped gemm.
    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
        Shape: scalar or [M]
    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
        quantize the intermediate result between the gemms.
        Shape: scalar or [M]
    - out_dtype (torch.dtype): The output tensor type. Defaults to
        torch.half.
    - expert_map (Optional[torch.Tensor]): In the case of Expert parallel,
        every Rank is responsible for a subset of experts. expert_map is a
        mapping from global expert-id to local expert-id. When expert_map[i]
        is -1, it means that this Rank is not responsible for global
        expert-id i.
    - apply_router_weight_on_input (bool): When true, the topk weights are
        applied directly on the inputs. This is only applicable when topk is 1.

    Returns:
    - torch.Tensor: The output tensor after applying the MoE layer.
        out_dtype is forwarded to the experts implementation (presumably
        the dtype of the result -- confirm).
    """
    # Per-token activation quantization is inferred from the first scale
    # that was provided: more than one element means one scale per token.
    # a2_scale is only consulted when a1_scale is absent.
    if a1_scale is not None:
        per_act_token = a1_scale.numel() != 1
    elif a2_scale is not None:
        per_act_token = a2_scale.numel() != 1
    else:
        per_act_token = False

    fn = mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(
            per_channel_quant=per_act_token,
            quant_dtype=torch.float8_e4m3fn,
        ),
        CutlassExpertsFp8(
            ab_strides1,
            c_strides1,
            ab_strides2,
            c_strides2,
            out_dtype,
        ),
    )

    return fn(
        a,
        w1_q,
        w2_q,
        topk_weights,
        topk_ids,
        expert_map=expert_map,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        apply_router_weight_on_input=apply_router_weight_on_input,
    )

fused_experts ¶

fused_experts(
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    inplace: bool = False,
    activation: str = "silu",
    apply_router_weight_on_input: bool = False,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    global_num_experts: int = -1,
    expert_map: Optional[Tensor] = None,
    w1_scale: Optional[Tensor] = None,
    w2_scale: Optional[Tensor] = None,
    w1_zp: Optional[Tensor] = None,
    w2_zp: Optional[Tensor] = None,
    a1_scale: Optional[Tensor] = None,
    a2_scale: Optional[Tensor] = None,
    block_shape: Optional[list[int]] = None,
    allow_deep_gemm: bool = False,
) -> Tensor

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def fused_experts(hidden_states: torch.Tensor,
                  w1: torch.Tensor,
                  w2: torch.Tensor,
                  topk_weights: torch.Tensor,
                  topk_ids: torch.Tensor,
                  inplace: bool = False,
                  activation: str = "silu",
                  apply_router_weight_on_input: bool = False,
                  use_fp8_w8a8: bool = False,
                  use_int8_w8a8: bool = False,
                  use_int8_w8a16: bool = False,
                  use_int4_w4a16: bool = False,
                  per_channel_quant: bool = False,
                  global_num_experts: int = -1,
                  expert_map: Optional[torch.Tensor] = None,
                  w1_scale: Optional[torch.Tensor] = None,
                  w2_scale: Optional[torch.Tensor] = None,
                  w1_zp: Optional[torch.Tensor] = None,
                  w2_zp: Optional[torch.Tensor] = None,
                  a1_scale: Optional[torch.Tensor] = None,
                  a2_scale: Optional[torch.Tensor] = None,
                  block_shape: Optional[list[int]] = None,
                  allow_deep_gemm: bool = False) -> torch.Tensor:
    """Run the fused experts computation via DeepGemm or the default path.

    ``deep_gemm_moe_fp8`` is used only when all of the following hold:
    ``allow_deep_gemm`` and ``use_fp8_w8a8`` are set, ``w1.shape[1]`` is
    greater than 512, and ``_valid_deep_gemm`` accepts the inputs. That path
    asserts that ``apply_router_weight_on_input`` is False, and it is not
    passed the quantization flags, ``per_channel_quant``, the zero points
    or ``block_shape``.

    In every other case the call is forwarded to the callable returned by
    ``dispatch_fused_experts_func(inplace)`` with all arguments except
    ``inplace`` and ``allow_deep_gemm``.

    Returns:
        The tensor produced by whichever implementation was selected.
    """
    # For now, disable DeepGemm for small N (<= 512) until better
    # permute/unpermute ops are available.
    n_dim = w1.shape[1]
    take_deep_gemm_path = (allow_deep_gemm and use_fp8_w8a8 and n_dim > 512
                           and _valid_deep_gemm(hidden_states, w1, w2,
                                                expert_map))

    if not take_deep_gemm_path:
        experts_impl = dispatch_fused_experts_func(inplace)
        return experts_impl(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
            per_channel_quant=per_channel_quant,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            w1_zp=w1_zp,
            w2_zp=w2_zp,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            block_shape=block_shape,
        )

    assert apply_router_weight_on_input is False
    return deep_gemm_moe_fp8(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=inplace,
        activation=activation,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        apply_router_weight_on_input=apply_router_weight_on_input,
    )

fused_topk ¶

fused_topk(
    hidden_states: Tensor,
    gating_output: Tensor,
    topk: int,
    renormalize: bool,
    indices_type: Optional[dtype] = None,
) -> tuple[Tensor, Tensor, Tensor]

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def fused_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    indices_type: Optional[torch.dtype] = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Select the top-k experts per token with the dispatched top-k function.

    Allocates the output buffers on the device of ``hidden_states``, casts
    ``gating_output`` to float32 and passes everything, together with
    ``renormalize``, to the callable returned by ``dispatch_topk_func()``.

    Args:
        hidden_states: Two-dimensional tensor; only its shape and device
            are read here.
        gating_output: Gating tensor whose first dimension must match
            that of ``hidden_states``.
        topk: Width of the second dimension of every output buffer.
        renormalize: Forwarded unchanged to the dispatched function.
        indices_type: Dtype for the ids buffer; int32 when ``None``.

    Returns:
        ``(topk_weights, topk_ids, token_expert_indices)``. The first two
        are what the dispatched function returns; the third is the int32
        buffer allocated here and handed to that function.
    """
    assert hidden_states.shape[0] == gating_output.shape[0], (
        "Number of tokens mismatch")

    num_tokens, _ = hidden_states.shape
    out_shape = (num_tokens, topk)
    out_device = hidden_states.device
    ids_dtype = indices_type if indices_type is not None else torch.int32

    weights_buf = torch.empty(out_shape,
                              dtype=torch.float32,
                              device=out_device)
    ids_buf = torch.empty(out_shape, dtype=ids_dtype, device=out_device)
    token_expert_indices = torch.empty(out_shape,
                                       dtype=torch.int32,
                                       device=out_device)

    # TODO(woosuk): Optimize this.
    gating_fp32 = gating_output.float()

    select_topk = dispatch_topk_func()
    topk_weights, topk_ids = select_topk(weights_buf, ids_buf,
                                         token_expert_indices, gating_fp32,
                                         renormalize)

    return topk_weights, topk_ids, token_expert_indices

get_config ¶

get_config() -> Optional[dict[str, Any]]

Source code in vllm/model_executor/layers/fused_moe/__init__.py

def get_config() -> Optional[dict[str, Any]]:
    """Return the current value of the module-level ``_config``.

    This is the value installed by ``override_config`` while its context
    is active, or whatever ``_config`` held before that.
    """
    return _config

get_config_file_name ¶

get_config_file_name(
    E: int,
    N: int,
    dtype: Optional[str],
    block_shape: Optional[list[int]] = None,
) -> str

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

def get_config_file_name(E: int,
                         N: int,
                         dtype: Optional[str],
                         block_shape: Optional[list[int]] = None) -> str:
    """Build the JSON config file name for the given parameters.

    The result has the form
    ``E=<E>,N=<N>,device_name=<device>[,dtype=<dtype>][,block_shape=<shape>].json``.
    The device name comes from ``current_platform.get_device_name()`` with
    spaces replaced by underscores.

    Args:
        E: Value written after ``E=``.
        N: Value written after ``N=``.
        dtype: Written as ``dtype=<dtype>``; the field is left out when
            this is falsy.
        block_shape: Written as ``block_shape=<list>`` with spaces
            removed; the field is left out when the list is falsy or
            contains any falsy entry.

    Returns:
        The comma-separated file name ending in ``.json``.
    """
    device = current_platform.get_device_name().replace(" ", "_")

    fields = [f"E={E}", f"N={N}", f"device_name={device}"]
    if dtype:
        fields.append(f"dtype={dtype}")
    if block_shape and all(block_shape):
        # The list repr contains ", " separators; strip the spaces.
        fields.append(f"block_shape={block_shape}".replace(" ", ""))

    return ",".join(fields) + ".json"

grouped_topk ¶

grouped_topk(
    hidden_states: Tensor,
    gating_output: Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor]

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py

@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def grouped_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, torch.Tensor]:
    """Pick top-k experts per token, restricted to the best expert groups.

    Scores are computed from ``gating_output`` with softmax or sigmoid and
    viewed as ``num_expert_group`` groups. Each group gets a score: the
    maximum expert score, or the sum of its two highest biased scores when
    ``e_score_correction_bias`` is given. The ``topk_group`` best groups
    are kept, experts outside them are masked to ``-inf``, and ``topk``
    experts are chosen from the masked scores.

    When a correction bias is supplied it only influences which experts
    are selected; the returned weights are gathered from the unbiased
    scores.

    Returns:
        ``(topk_weights, topk_ids)`` cast to float32 and int32. The weights
        are divided by their per-token sum when ``renormalize`` is true.

    Raises:
        ValueError: If ``scoring_func`` is neither ``"softmax"`` nor
            ``"sigmoid"``.
    """
    assert hidden_states.shape[0] == gating_output.shape[0], (
        "Number of tokens mismatch")

    if scoring_func == "softmax":
        routing_scores = torch.softmax(gating_output, dim=-1)
    elif scoring_func == "sigmoid":
        routing_scores = gating_output.sigmoid()
    else:
        raise ValueError(f"Unsupported scoring function: {scoring_func}")

    n_tokens = routing_scores.shape[0]

    if e_score_correction_bias is None:
        select_scores = routing_scores
        group_scores = select_scores.view(
            n_tokens, num_expert_group, -1).max(dim=-1).values  # [n, n_group]
    else:
        # Biased scores drive expert selection; the unbiased
        # ``routing_scores`` are kept for the routing weights.
        select_scores = routing_scores + e_score_correction_bias.unsqueeze(0)
        best_two = select_scores.view(n_tokens, num_expert_group,
                                      -1).topk(2, dim=-1).values
        group_scores = best_two.sum(dim=-1)  # [n, n_group]

    kept_groups = torch.topk(group_scores,
                             k=topk_group,
                             dim=-1,
                             sorted=False).indices  # [n, top_k_group]
    group_mask = torch.zeros_like(group_scores).scatter_(
        1, kept_groups, 1)  # [n, n_group]

    # Broadcast each group's keep flag onto all experts of that group.
    experts_per_group = select_scores.shape[-1] // num_expert_group
    expert_mask = group_mask.unsqueeze(-1).expand(
        n_tokens, num_expert_group,
        experts_per_group).reshape(n_tokens, -1)  # [n, e]
    dropped = ~expert_mask.bool()
    masked_scores = select_scores.masked_fill(dropped,
                                              float("-inf"))  # [n, e]

    top = torch.topk(masked_scores, k=topk, dim=-1, sorted=False)
    topk_ids = top.indices
    if e_score_correction_bias is None:
        topk_weights = top.values
    else:
        # Use original unbiased scores for the routing weights
        topk_weights = routing_scores.gather(1, topk_ids)

    if renormalize:
        weight_sum = topk_weights.sum(dim=-1, keepdim=True)
        topk_weights = topk_weights / weight_sum

    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)

override_config ¶

override_config(config)

Source code in vllm/model_executor/layers/fused_moe/__init__.py

@contextmanager
def override_config(config):
    """Temporarily replace the module-level ``_config``.

    Installs ``config`` as the global ``_config`` for the duration of the
    ``with`` block and puts the previous value back when the block exits,
    whether it exits normally or by raising.

    Args:
        config: Value to expose through ``_config`` while the context is
            active.
    """
    global _config
    old_config = _config
    _config = config
    try:
        yield
    finally:
        # Restore in ``finally`` so an exception raised inside the with-body
        # cannot leave the override installed for later callers.
        _config = old_config

vllm.model_executor.layers.fused_moe

__all__ module-attribute ¶

_config module-attribute ¶

FusedMoE ¶

activation instance-attribute ¶

apply_router_weight_on_input instance-attribute ¶

custom_routing_function instance-attribute ¶

dp_rank property ¶

dp_size property ¶

e_score_correction_bias instance-attribute ¶

ep_rank property ¶

ep_size property ¶

global_num_experts instance-attribute ¶

hidden_size instance-attribute ¶

hpu_fused_moe instance-attribute ¶

intermediate_size_per_partition instance-attribute ¶

layer_name instance-attribute ¶

moe_config instance-attribute ¶

moe_parallel_config instance-attribute ¶

num_expert_group instance-attribute ¶

params_dtype instance-attribute ¶

quant_config instance-attribute ¶

quant_method instance-attribute ¶

reduce_results instance-attribute ¶

renormalize instance-attribute ¶

scoring_func instance-attribute ¶

top_k instance-attribute ¶

topk_group instance-attribute ¶

tp_rank property ¶

tp_size property ¶

use_direct_call instance-attribute ¶

use_ep property ¶

use_grouped_topk instance-attribute ¶

use_pplx_kernels property ¶

__init__ ¶

_load_g_idx ¶

_load_model_weight_or_group_weight_scale ¶

_load_per_channel_weight_scale ¶

_load_per_tensor_weight_scale ¶

_load_single_value ¶

_load_w13 ¶

_load_w2 ¶

_map_global_expert_id_to_local_expert_id ¶

extra_repr ¶

forward ¶

forward_impl ¶

forward_impl_chunked ¶

make_expert_params_mapping classmethod ¶

maybe_all_reduce_tensor_model_parallel ¶

must_reduce_shared_expert_outputs ¶

select_experts staticmethod ¶

weight_loader ¶

FusedMoEMethodBase ¶

apply abstractmethod ¶

create_weights abstractmethod ¶

init_prepare_finalize ¶

select_gemm_impl ¶

FusedMoeWeightScaleSupported ¶

BLOCK class-attribute instance-attribute ¶

CHANNEL class-attribute instance-attribute ¶

GROUP class-attribute instance-attribute ¶

TENSOR class-attribute instance-attribute ¶

TritonExperts ¶

block_m instance-attribute ¶

block_shape instance-attribute ¶

per_channel_quant instance-attribute ¶

qtype instance-attribute ¶

use_fp8_w8a8 instance-attribute ¶

use_int4_w4a16 instance-attribute ¶

use_int8_w8a16 instance-attribute ¶

use_int8_w8a8 instance-attribute ¶

__init__ ¶

apply ¶

workspace_shapes ¶

cutlass_moe_fp4 ¶

Gemm 1¶

Gemm 2¶

cutlass_moe_fp8 ¶

fused_experts ¶

fused_topk ¶

__all__ `module-attribute` ¶

_config `module-attribute` ¶

activation `instance-attribute` ¶

apply_router_weight_on_input `instance-attribute` ¶

custom_routing_function `instance-attribute` ¶

dp_rank `property` ¶

dp_size `property` ¶

e_score_correction_bias `instance-attribute` ¶

ep_rank `property` ¶

ep_size `property` ¶

global_num_experts `instance-attribute` ¶

hidden_size `instance-attribute` ¶

hpu_fused_moe `instance-attribute` ¶

intermediate_size_per_partition `instance-attribute` ¶

layer_name `instance-attribute` ¶

moe_config `instance-attribute` ¶

moe_parallel_config `instance-attribute` ¶

num_expert_group `instance-attribute` ¶

params_dtype `instance-attribute` ¶

quant_config `instance-attribute` ¶

quant_method `instance-attribute` ¶

reduce_results `instance-attribute` ¶

renormalize `instance-attribute` ¶

scoring_func `instance-attribute` ¶

top_k `instance-attribute` ¶

topk_group `instance-attribute` ¶

tp_rank `property` ¶

tp_size `property` ¶

use_direct_call `instance-attribute` ¶

use_ep `property` ¶

use_grouped_topk `instance-attribute` ¶

use_pplx_kernels `property` ¶

__init__ ¶

make_expert_params_mapping `classmethod` ¶

select_experts `staticmethod` ¶

apply `abstractmethod` ¶

create_weights `abstractmethod` ¶

BLOCK `class-attribute` `instance-attribute` ¶

CHANNEL `class-attribute` `instance-attribute` ¶

GROUP `class-attribute` `instance-attribute` ¶

TENSOR `class-attribute` `instance-attribute` ¶

block_m `instance-attribute` ¶

block_shape `instance-attribute` ¶

per_channel_quant `instance-attribute` ¶

qtype `instance-attribute` ¶

use_fp8_w8a8 `instance-attribute` ¶

use_int4_w4a16 `instance-attribute` ¶

use_int8_w8a16 `instance-attribute` ¶

use_int8_w8a8 `instance-attribute` ¶

__init__ ¶