llmcompressor.modifiers.transform.awq.dynamic_mappings

Dynamic AWQ mapping builders for hybrid attention models.

Models with hybrid attention (mix of full self-attention and linear/Gated DeltaNet attention) need layer-index-specific AWQ mappings that vary by model size. This module provides runtime detection and mapping generation for such architectures (e.g. Qwen3Next, Qwen3.5).

Functions:

get_layer_mappings_from_model –

Infer AWQ mappings from a model. Checks the dynamic mapping registry first, then the static registry, and finally falls back to the default mappings.

build_hybrid_attention_mappings

build_hybrid_attention_mappings(
    model: Module,
) -> list[AWQMapping] | None

Dynamically build AWQ mappings for models with hybrid attention (full self-attention + linear/Gated DeltaNet attention), such as Qwen3Next and Qwen3.5.

Reads layer_types from the model config to determine which layers use full vs linear attention, then inspects the model's module names to detect the correct linear attention projection names and MLP structure.

Returns None if the model is not a hybrid attention model.

Source code in src/llmcompressor/modifiers/transform/awq/dynamic_mappings.py

def build_hybrid_attention_mappings(model: Module) -> list[AWQMapping] | None:
    """
    Build AWQ mappings at runtime for hybrid attention models, i.e. models
    that mix full self-attention layers with linear (Gated DeltaNet)
    attention layers, such as Qwen3Next and Qwen3.5.

    The layer types obtained for the model are partitioned into
    full-attention and linear-attention layer indices, and each group gets
    its own ``input_layernorm`` mapping restricted to those indices. The
    linear attention projection names and the MoE vs. dense MLP layout are
    detected from the model itself.

    :param model: the model to build mappings for
    :return: list of AWQMapping for the model, or None if the model is not
        a hybrid attention model or does not contain both full and linear
        attention layers
    """
    hybrid_config = _get_hybrid_attention_config(model)
    if hybrid_config is None:
        return None
    layer_types, num_layers = hybrid_config

    # Partition the layer indices by attention type in a single pass
    full_indices = []
    linear_indices = []
    for layer_idx in range(num_layers):
        layer_type = layer_types[layer_idx]
        if layer_type == "full_attention":
            full_indices.append(layer_idx)
        elif layer_type == "linear_attention":
            linear_indices.append(layer_idx)

    # A hybrid model needs at least one layer of each kind; otherwise let
    # the caller fall back to another source of mappings
    if not (full_indices and linear_indices):
        logger.warning(
            "Hybrid attention model detected but could not find indices for "
            "both full and linear attention layers. Falling back."
        )
        return None

    # Regex alternations ("0|4|8") used to pin a mapping to specific layers
    full_re = "|".join(map(str, full_indices))
    linear_re = "|".join(map(str, linear_indices))

    linear_proj_names = _detect_linear_attn_projections(model)
    is_moe = is_moe_model(model)

    # Full attention layers: input_layernorm -> q/k/v_proj
    hybrid_mappings = [
        AWQMapping(
            f"re:.*layers\\.({full_re})\\.input_layernorm$",
            [
                "re:.*self_attn.q_proj$",
                "re:.*self_attn.k_proj$",
                "re:.*self_attn.v_proj$",
            ],
        )
    ]

    # Linear attention layers: input_layernorm -> linear_attn projections
    # (skipped entirely when no projection names were detected)
    if linear_proj_names:
        linear_balance_layers = [f"re:.*linear_attn.{p}$" for p in linear_proj_names]
        hybrid_mappings.append(
            AWQMapping(
                f"re:.*layers\\.({linear_re})\\.input_layernorm$",
                linear_balance_layers,
            )
        )

    # post_attention_layernorm feeds either the MoE experts (routed and
    # shared) or the dense gate/up projections
    if is_moe:
        mlp_balance_layers = [
            # TODO: should add "re:.*mlp.gate.weight$" but is a Parameter
            "re:.*mlp.experts.*.gate_proj$",
            "re:.*mlp.experts.*.up_proj$",
            "re:.*mlp.shared_expert_gate$",
            "re:.*mlp.shared_expert.gate_proj$",
            "re:.*mlp.shared_expert.up_proj$",
        ]
    else:
        mlp_balance_layers = ["re:.*gate_proj$", "re:.*up_proj$"]
    hybrid_mappings.append(
        AWQMapping("re:.*post_attention_layernorm$", mlp_balance_layers)
    )

    # up_proj -> down_proj applies to both MoE and dense MLPs
    hybrid_mappings.append(AWQMapping("re:.*up_proj$", ["re:.*down_proj$"]))

    logger.info(
        f"Built dynamic hybrid attention AWQ mappings: "
        f"{len(full_indices)} full-attention layers, "
        f"{len(linear_indices)} linear-attention layers, "
        f"linear projections: {linear_proj_names}, MoE: {is_moe}"
    )

    return hybrid_mappings

get_layer_mappings_from_model

get_layer_mappings_from_model(
    model: Module,
) -> list[AWQMapping]

Infer AWQ mappings from a model. Checks the dynamic mapping registry first (for models needing runtime-generated mappings), then falls back to the static registry, then to default mappings.

Parameters:

model (Module) –

the model to infer mappings for

Returns:

list[AWQMapping] –

list of AWQMapping for the model

Source code in src/llmcompressor/modifiers/transform/awq/dynamic_mappings.py

def get_layer_mappings_from_model(model: Module) -> list[AWQMapping]:
    """
    Resolve the AWQ mappings to use for a model, keyed by the name of the
    model's class.

    Lookup order:

    1. the dynamic mapping registry, whose entries are called with the
       model to generate mappings at runtime (a ``None`` result means the
       builder declined and the lookup continues)
    2. the static mapping registry
    3. the default mappings

    :param model: the model to infer mappings for
    :return: list of AWQMapping for the model
    """
    model_name = model.__class__.__name__

    # Dynamic builders take precedence, but may decline by returning None
    dynamic_mappings = (
        AWQ_DYNAMIC_MAPPING_REGISTRY[model_name](model)
        if model_name in AWQ_DYNAMIC_MAPPING_REGISTRY
        else None
    )
    if dynamic_mappings is not None:
        return dynamic_mappings

    # Unknown architecture: report it and use the defaults
    if model_name not in AWQ_MAPPING_REGISTRY:
        logger.info(
            f"Architecture {model_name} not found in mappings. "
            f"Using default mappings: {default_mappings}"
        )
        return default_mappings

    return AWQ_MAPPING_REGISTRY[model_name]