`vllm.models.deepseek_v32.nvidia.kernels` ¶

Functions:

fused_eh_norm –

Returns cat([enorm(masked embeds), hnorm(prev_hidden)]) -> [N, 2H].
fused_q –

Fuse the MQA-query and indexer-query RoPE/quantization.

`_fp8_ue8m0_quantize(vals)` ¶

Quantize float32 values to FP8 E4M3 with a ue8m0 (power-of-2) scale.

Returns (fp8_vals, scale) so the caller can store them or reuse the scale.

Source code in vllm/models/deepseek_v32/nvidia/kernels.py

@triton.jit
def _fp8_ue8m0_quantize(vals):
    """Quantize values to FP8 E4M3 using a single power-of-two (ue8m0) scale.

    The input is cast to float32 and its absolute maximum is taken with
    ``tl.max``. That maximum, floored at 1e-4, is divided by 448.0 and the
    ratio is rounded up to the next power of two to form the scale. The
    values are then divided by the scale and cast to ``tl.float8e4nv``.

    Returns ``(fp8_vals, scale)`` so the caller can store the quantized
    values and store or reuse the scale.
    """
    x = vals.to(tl.float32)
    abs_max = tl.max(tl.abs(x))
    # Floor the amax so an all-zero input cannot yield a zero scale.
    # NOTE(review): 448.0 is presumably the E4M3 max magnitude — confirm.
    raw_scale = tl.div_rn(tl.maximum(abs_max, 1e-4), 448.0)
    # Round the scale up to a power of two: 2 ** ceil(log2(raw_scale)).
    pow2_exponent = tl.math.ceil(tl.math.log2(raw_scale))
    scale = tl.math.exp2(pow2_exponent)
    quantized = tl.div_rn(x, scale).to(tl.float8e4nv)
    return quantized, scale

`_fused_eh_norm_kernel(pos_ptr, embeds_ptr, embeds_stride, prev_ptr, prev_stride, enorm_w_ptr, hnorm_w_ptr, eps, out_ptr, out_stride, H, BLOCK)` ¶

MTP input fusion: zero embeds at position 0, RMSNorm(embeds) with enorm and RMSNorm(prev_hidden) with hnorm, written side-by-side into out ([N, 2H]) ready for the eh_proj GEMM. Replaces where + 2x RMSNorm + cat.

Source code in vllm/models/deepseek_v32/nvidia/kernels.py

@triton.jit
def _fused_eh_norm_kernel(
    pos_ptr,
    embeds_ptr,
    embeds_stride,
    prev_ptr,
    prev_stride,
    enorm_w_ptr,
    hnorm_w_ptr,
    eps,
    out_ptr,
    out_stride,
    H: tl.constexpr,
    BLOCK: tl.constexpr,
):
    """Fused MTP input preparation; one program per row (``program_id(0)``).

    For its row, the program:
      * loads the embedding row and replaces it with zeros when the row's
        position equals 0, then applies ``_rms_norm`` with the enorm weight
        and stores the result in columns ``[0, H)`` of the output row;
      * loads the previous-hidden row, applies ``_rms_norm`` with the hnorm
        weight and stores the result in columns ``[H, 2H)``.

    The output ([N, 2H]) is thereby laid out ready for the eh_proj GEMM,
    replacing a separate where + 2x RMSNorm + cat.

    ``BLOCK`` is the width of the column range; lanes at or beyond ``H`` are
    masked on every load and store. Columns are addressed with unit stride;
    only the row strides are taken as arguments.
    """
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK)
    in_row = cols < H
    out_row_ptr = out_ptr + row * out_stride

    # Embedding half -> out[row, 0:H].
    position = tl.load(pos_ptr + row)
    embeds = tl.load(embeds_ptr + row * embeds_stride + cols, mask=in_row, other=0.0)
    # The position-0 row is zeroed before normalization; the cast to float32
    # happens here for the embedding branch.
    embeds = tl.where(position == 0, 0.0, embeds.to(tl.float32))
    # NOTE(review): the weight loads pass no `other=`; this relies on
    # `_rms_norm` (defined elsewhere) not letting masked lanes affect the
    # stored lanes — confirm.
    enorm_w = tl.load(enorm_w_ptr + cols, mask=in_row)
    tl.store(out_row_ptr + cols, _rms_norm(embeds, enorm_w, eps, H), mask=in_row)

    # Previous-hidden half -> out[row, H:2H].
    prev = tl.load(prev_ptr + row * prev_stride + cols, mask=in_row, other=0.0)
    hnorm_w = tl.load(hnorm_w_ptr + cols, mask=in_row)
    tl.store(out_row_ptr + H + cols, _rms_norm(prev, hnorm_w, eps, H), mask=in_row)

`fused_eh_norm(positions, inputs_embeds, previous_hidden, enorm_w, hnorm_w, eps)` ¶

Returns cat([enorm(masked embeds), hnorm(prev_hidden)]) -> [N, 2H].

Source code in vllm/models/deepseek_v32/nvidia/kernels.py

def fused_eh_norm(
    positions: torch.Tensor,
    inputs_embeds: torch.Tensor,
    previous_hidden: torch.Tensor,
    enorm_w: torch.Tensor,
    hnorm_w: torch.Tensor,
    eps: float,
) -> torch.Tensor:
    """Return cat([enorm(masked embeds), hnorm(prev_hidden)]) -> [N, 2H].

    Allocates an ``[N, 2H]`` output with the dtype and device of
    ``inputs_embeds`` and launches ``_fused_eh_norm_kernel`` with one program
    per row of ``inputs_embeds``.

    ``inputs_embeds`` must be 2-D (``[N, H]``); its shape is unpacked into
    exactly two values. Only the row stride (``stride(0)``) of
    ``inputs_embeds``, ``previous_hidden`` and the output is handed to the
    kernel.
    """
    num_tokens, hidden = inputs_embeds.shape
    out = torch.empty(
        (num_tokens, 2 * hidden),
        dtype=inputs_embeds.dtype,
        device=inputs_embeds.device,
    )

    # One program per token row; the block width is the hidden size rounded
    # up to a power of two.
    grid = (num_tokens,)
    block = triton.next_power_of_2(hidden)
    _fused_eh_norm_kernel[grid](
        positions,
        inputs_embeds,
        inputs_embeds.stride(0),
        previous_hidden,
        previous_hidden.stride(0),
        enorm_w,
        hnorm_w,
        eps,
        out,
        out.stride(0),
        hidden,
        block,
    )
    return out

`fused_q(positions, q_pe, q_pe_cos_sin_cache, index_q, index_q_cos_sin_cache, ql_nope, q_scale, index_weights, index_weights_softmax_scale, index_weights_head_scale, has_indexer=True, index_rope_interleave=False, quantize_mqa=True)` ¶

Fuse the MQA-query and indexer-query RoPE/quantization.

Returns (index_q_fp8, index_weights_out, mqa_q). When quantize_mqa is True (FlashInfer sparse, fp8 query) mqa_q is a single fp8 tensor packing [ql_nope; q_pe]. When False (FlashMLA sparse, bf16 query) it is the RoPE'd q_pe in bf16; the caller pairs it with ql_nope as the (ql_nope, q_pe) tuple the backend expects.

Source code in vllm/models/deepseek_v32/nvidia/kernels.py

def fused_q(
    positions: torch.Tensor,
    q_pe: torch.Tensor,
    q_pe_cos_sin_cache: torch.Tensor,
    index_q: torch.Tensor | None,
    index_q_cos_sin_cache: torch.Tensor | None,
    ql_nope: torch.Tensor,
    q_scale: torch.Tensor,
    # Index weights
    index_weights: torch.Tensor | None,
    index_weights_softmax_scale: float,
    index_weights_head_scale: float,
    has_indexer: bool = True,
    index_rope_interleave: bool = False,
    quantize_mqa: bool = True,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Fuse the MQA-query and indexer-query RoPE/quantization.

    Returns ``(index_q_fp8, index_weights_out, mqa_q)``. When ``quantize_mqa``
    is True (FlashInfer sparse, fp8 query) ``mqa_q`` is a single fp8 tensor
    packing ``[ql_nope; q_pe]``. When False (FlashMLA sparse, bf16 query) it is
    the RoPE'd ``q_pe`` in bf16; the caller pairs it with ``ql_nope`` as the
    ``(ql_nope, q_pe)`` tuple the backend expects.
    """
    assert positions.ndim == 1
    assert q_pe.ndim == 3
    assert q_pe_cos_sin_cache.ndim == 2
    assert ql_nope.ndim == 3
    assert ql_nope.shape[:2] == q_pe.shape[:2]

    num_tokens = positions.shape[0]
    num_q_heads = q_pe.shape[1]
    # Grid's 3rd dim must cover the MQA-pack heads (pid 0/2 iterate 2 heads
    # each) and, when present, the indexer heads (pid 1).
    mqa_grid_heads = (num_q_heads + 1) // 2
    if not has_indexer:
        # Shared layer: cached 1-element dummies; pid 1 skipped by HAS_INDEXER
        # and never dereferences them.
        index_q = _dummy((1, 1, 1), q_pe.dtype, q_pe.device)
        index_q_cos_sin_cache = q_pe_cos_sin_cache
        index_weights = _dummy((1, 1), torch.float32, q_pe.device)
    assert index_q is not None and index_q.ndim == 3
    assert index_q_cos_sin_cache is not None
    assert index_weights is not None
    num_index_q_heads = index_q.shape[1]
    index_q_head_dim = index_q.shape[2]
    grid_heads = max(mqa_grid_heads, num_index_q_heads)
    if quantize_mqa:
        # fp8 path: pack [ql_nope; q_pe] into a single fp8 tensor.
        mqa_q_fp8 = torch.empty(
            q_pe.shape[0],
            q_pe.shape[1],
            ql_nope.shape[2] + q_pe.shape[2],
            dtype=torch.float8_e4m3fn,
            device=q_pe.device,
        )
        # Placeholder; pid 0 packs q_pe into mqa_q_fp8 instead.
        q_pe_out = mqa_q_fp8
        mqa_q = mqa_q_fp8
    else:
        # bf16 path: only the RoPE'd q_pe is produced; ql_nope used directly.
        q_pe_out = torch.empty_like(q_pe)
        mqa_q_fp8 = q_pe_out  # unused placeholder for the fp8 pack pointer
        mqa_q = q_pe_out

    index_q_fp8 = torch.empty_like(index_q, dtype=torch.float8_e4m3fn)
    index_weights_out = torch.empty_like(index_weights, dtype=torch.float32)
    _fused_q_kernel[(3, num_tokens, grid_heads)](
        positions,
        q_pe,
        q_pe.stride(0),
        q_pe.stride(1),
        num_q_heads,
        q_pe_cos_sin_cache,
        q_pe_cos_sin_cache.stride(0),
        q_pe_cos_sin_cache.shape[-1] // 2,
        index_q,
        index_q.stride(0),
        index_q.stride(1),
        num_index_q_heads,
        index_q_cos_sin_cache,
        index_q_cos_sin_cache.stride(0),
        index_q_cos_sin_cache.shape[-1] // 2,
        index_q_fp8,
        index_q_fp8.stride(0),
        index_q_fp8.stride(1),
        index_q_head_dim,
        ql_nope,
        ql_nope.stride(0),
        ql_nope.stride(1),
        mqa_q_fp8,
        mqa_q_fp8.stride(0),
        mqa_q_fp8.stride(1),
        q_scale,
        ql_nope.shape[2],
        triton.next_power_of_2(ql_nope.shape[2]),
        q_pe_out,
        q_pe_out.stride(0),
        q_pe_out.stride(1),
        index_weights,
        index_weights.stride(0),
        index_weights_softmax_scale,
        index_weights_head_scale,
        index_weights_out,
        index_weights_out.stride(0),
        HAS_INDEXER=has_indexer,
        INDEX_ROPE_INTERLEAVE=index_rope_interleave,
        QUANTIZE_MQA=quantize_mqa,
        # num_warps=1 is optimal here: each program is a single 128-element
        # rope+quant, so the kernel is program-count/occupancy bound, not
        # per-program compute bound (swept 1/2/4/8 — 1 wins or ties everywhere).
        num_warps=1,
    )
    return index_q_fp8, index_weights_out, mqa_q

vllm.models.deepseek_v32.nvidia.kernels ¶

_fp8_ue8m0_quantize(vals) ¶

_fused_eh_norm_kernel(pos_ptr, embeds_ptr, embeds_stride, prev_ptr, prev_stride, enorm_w_ptr, hnorm_w_ptr, eps, out_ptr, out_stride, H, BLOCK) ¶

fused_eh_norm(positions, inputs_embeds, previous_hidden, enorm_w, hnorm_w, eps) ¶

fused_q(positions, q_pe, q_pe_cos_sin_cache, index_q, index_q_cos_sin_cache, ql_nope, q_scale, index_weights, index_weights_softmax_scale, index_weights_head_scale, has_indexer=True, index_rope_interleave=False, quantize_mqa=True) ¶

`vllm.models.deepseek_v32.nvidia.kernels` ¶

`_fp8_ue8m0_quantize(vals)` ¶

`_fused_eh_norm_kernel(pos_ptr, embeds_ptr, embeds_stride, prev_ptr, prev_stride, enorm_w_ptr, hnorm_w_ptr, eps, out_ptr, out_stride, H, BLOCK)` ¶

`fused_eh_norm(positions, inputs_embeds, previous_hidden, enorm_w, hnorm_w, eps)` ¶

`fused_q(positions, q_pe, q_pe_cos_sin_cache, index_q, index_q_cos_sin_cache, ql_nope, q_scale, index_weights, index_weights_softmax_scale, index_weights_head_scale, has_indexer=True, index_rope_interleave=False, quantize_mqa=True)` ¶