Skip to content

speculators.models

Modules:

  • base_components

    Shared base model components for all speculator types.

  • eagle

    Speculators implementations providing a unified implementation for the EAGLE v1, EAGLE v2, and HASS variants.

  • eagle3
  • mlp

Classes:

Eagle3DraftModel

Eagle3DraftModel(
    config: Eagle3SpeculatorConfig,
    t2d: Tensor | None,
    d2t: Tensor | None,
)

Bases: SpeculatorModel

Methods:

Source code in speculators/models/eagle3/core.py
def __init__(
    self,
    config: Eagle3SpeculatorConfig,
    t2d: torch.Tensor | None,
    d2t: torch.Tensor | None,
):
    """
    Build the Eagle3 draft model from its config and optional vocabulary mappings.

    :param config: Eagle3 speculator configuration. Supplies the transformer layer
        config, the draft vocabulary size, and the verifier description.
    :param t2d: Optional mapping tensor registered as the ``t2d`` buffer. Its
        non-zero count must equal ``config.draft_vocab_size``.
    :param d2t: Optional mapping tensor registered as the ``d2t`` buffer. Its
        first dimension must equal ``config.draft_vocab_size``.
    :raises ValueError: If only one of ``t2d`` / ``d2t`` is given, or if either
        tensor disagrees with the draft vocabulary size.
    """
    # The base class is initialized without a verifier model.
    super().__init__(
        config=config, verifier=None, verifier_attachment_mode="train_only"
    )
    layer_config = config.transformer_layer_config
    self.hidden_size = layer_config.hidden_size
    self.draft_vocab_size = config.draft_vocab_size

    # The two mappings only make sense as a pair.
    have_t2d = t2d is not None
    have_d2t = d2t is not None
    if have_t2d != have_d2t:
        raise ValueError(
            "Both t2d and d2t must be provided together, or both must be None. "
            f"Got t2d={'provided' if t2d is not None else 'None'}, "
            f"d2t={'provided' if d2t is not None else 'None'}"
        )

    # register_buffer accepts None, so the buffer name is registered either way;
    # the consistency check only applies when a tensor was actually supplied.
    self.register_buffer("t2d", t2d)  # shape: [verifier_vocab_size], bool
    if t2d is not None:
        selected_tokens = int(t2d.sum(dtype=torch.long).item())
        if selected_tokens != self.draft_vocab_size:
            raise ValueError(
                f"t2d has {selected_tokens} non-zero values, "
                f"expected {self.draft_vocab_size}."
            )

    self.register_buffer("d2t", d2t)  # shape: [draft_vocab_size], int offsets
    if d2t is not None:
        d2t_length = d2t.shape[0]
        if d2t_length != self.draft_vocab_size:
            raise ValueError(
                f"d2t.shape[0] ({d2t_length}) must match"
                f" draft_vocab_size ({self.draft_vocab_size})."
            )

    # Projects a 3 * hidden_size input down to hidden_size.
    self.fc = torch.nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
    self._model_definitions = model_classes[layer_config.model_type]
    self._setup_decoder_layers(layer_config, config.norm_before_residual)
    self.norm = self._model_definitions.norm_class(
        self.hidden_size, eps=layer_config.rms_norm_eps
    )
    self._setup_rotary_embedding(layer_config)
    self._setup_embeddings_and_lm_heads(
        config.speculators_config.verifier, t2d, config.embed_requires_grad
    )

from_training_args classmethod

from_training_args(
    verifier_config: PretrainedConfig, **kwargs
) -> Eagle3DraftModel

Create Eagle3 model from training arguments.

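Args: verifier_config: Verifier model configuration. **kwargs: Training arguments with Eagle3-specific params: - num_layers: Number of decoder layers (listed in the docstring, but not read by the source shown below) - draft_vocab_size: Size of the draft vocabulary (required) - norm_before_residual: Whether to normalize before residual connection (required) - embed_requires_grad: Optional, defaults to False - t2d: Target-to-draft vocabulary mapping tensor - d2t: Draft-to-target vocabulary mapping tensor - ttt_steps: Number of TTT steps (required) - verifier_name_or_path: Path to verifier model (required)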

Returns: Initialized Eagle3DraftModel

Source code in speculators/models/eagle3/core.py
@classmethod
def from_training_args(
    cls,
    verifier_config: PretrainedConfig,
    **kwargs,
) -> "Eagle3DraftModel":
    """Build an Eagle3 draft model from a verifier config and training arguments.

    Args:
        verifier_config: Verifier model configuration. It is also passed as the
            draft model's ``transformer_layer_config``.
        **kwargs: Training arguments. Keys read by this method:
            - draft_vocab_size: Draft vocabulary size (required)
            - norm_before_residual: Whether to normalize before residual
                connection (required)
            - embed_requires_grad: Optional, defaults to False
            - ttt_steps: Number of TTT steps; used as the greedy proposal's
                ``speculative_tokens`` (required)
            - verifier_name_or_path: Path to verifier model (required)
            - t2d: Target-to-draft vocabulary mapping tensor (optional)
            - d2t: Draft-to-target vocabulary mapping tensor (optional)

    Returns:
        Initialized Eagle3DraftModel

    Raises:
        KeyError: If a required key is missing from ``kwargs``.
    """
    # Values are pulled out in the same order the nested constructor call
    # would evaluate them, so a missing key surfaces identically.
    draft_vocab_size = kwargs["draft_vocab_size"]
    norm_before_residual = kwargs["norm_before_residual"]
    embed_requires_grad = kwargs.get("embed_requires_grad", False)

    greedy_proposal = GreedyTokenProposalConfig(
        speculative_tokens=kwargs["ttt_steps"],
    )
    verifier_description = VerifierConfig.from_config(
        verifier_config, name_or_path=kwargs["verifier_name_or_path"]
    )
    speculators_config = SpeculatorsConfig(
        algorithm="eagle3",
        proposal_methods=[greedy_proposal],
        default_proposal_method="greedy",
        verifier=verifier_description,
    )

    model_config = Eagle3SpeculatorConfig(
        transformer_layer_config=verifier_config,
        draft_vocab_size=draft_vocab_size,
        norm_before_residual=norm_before_residual,
        embed_requires_grad=embed_requires_grad,
        speculators_config=speculators_config,
    )

    # Absent mapping tensors are forwarded as None.
    mapping_tensors = {name: kwargs.get(name) for name in ("t2d", "d2t")}
    return cls(config=model_config, **mapping_tensors)

get_trainer_kwargs staticmethod

get_trainer_kwargs(**kwargs) -> tuple[dict, dict]

Get training and validation kwargs for Eagle3.

Args: **kwargs: Training arguments

Returns: Tuple of (train_call_kwargs, val_call_kwargs)

Source code in speculators/models/eagle3/core.py
@staticmethod
def get_trainer_kwargs(**kwargs) -> tuple[dict, dict]:
    """Split training arguments into train-time and validation-time call kwargs.

    Args:
        **kwargs: Training arguments. Must contain ``use_off_policy_tokens``,
            ``ttt_steps`` and ``ttt_step_loss_decay``; other keys are ignored.

    Returns:
        Tuple of (train_call_kwargs, val_call_kwargs). Both carry the same
        ``ttt_steps`` and ``ttt_step_loss_decay``; validation always sets
        ``use_off_policy_tokens`` to False.
    """
    off_policy = kwargs["use_off_policy_tokens"]
    # Settings that are identical for training and validation.
    shared = {
        "ttt_steps": kwargs["ttt_steps"],
        "ttt_step_loss_decay": kwargs["ttt_step_loss_decay"],
    }
    train_call_kwargs = {"use_off_policy_tokens": off_policy, **shared}
    val_call_kwargs = {"use_off_policy_tokens": False, **shared}
    return train_call_kwargs, val_call_kwargs

Eagle3SpeculatorConfig

Eagle3SpeculatorConfig(**kwargs)

Bases: SpeculatorModelConfig

Configuration for EAGLE-3 speculator with vocabulary mapping.

EAGLE-3 features vocabulary mapping between draft (32K) and target (128K) vocabularies, enabling cross-tokenizer speculation.

Parameters:

  • transformer_layer_config

    Configuration for the transformer decoder layer

  • draft_vocab_size

    Size of draft model vocabulary for speculation

  • norm_before_residual

    Apply hidden_norm before storing residual

Methods:

Attributes:

Source code in speculators/config.py
def __init__(self, **kwargs):
    """
    Initialize the config through both of its parent initializers.

    :param kwargs: Config values. They are first handed to the Pydantic mixin
        initializer and then, with the model fields replaced by the values now
        set on the instance, to ``PretrainedConfig.__init__``.
    """
    # Pydantic goes first so that all valid model fields are set on the instance.
    PydanticClassRegistryMixin.__init__(self, **kwargs)

    # Re-read every model field from the instance so PretrainedConfig receives
    # those values and does not override them with the raw inputs.
    fields_from_instance = {
        name: getattr(self, name) for name in self.__class__.model_fields
    }
    kwargs.update(fields_from_instance)

    # Hugging Face initialization for the model config arguments.
    PretrainedConfig.__init__(self, **kwargs)

    # Always record the transformers version that is currently installed.
    self.transformers_version = version("transformers")

target_vocab_size property

target_vocab_size: int

Get target vocabulary size from transformer config.

serialize_transformer_config

serialize_transformer_config(
    value: PretrainedConfig,
) -> dict

Serialize transformer config to dict.

Source code in speculators/models/eagle3/config.py
@field_serializer("transformer_layer_config")
def serialize_transformer_config(self, value: PretrainedConfig) -> dict:
    """Return the dict produced by the config's ``to_diff_dict()`` method."""
    diff_dict = value.to_diff_dict()
    return diff_dict

validate_transformer_config classmethod

validate_transformer_config(value: Any) -> PretrainedConfig

Validate and convert transformer config.

Source code in speculators/models/eagle3/config.py
@field_validator("transformer_layer_config", mode="before")
@classmethod
def validate_transformer_config(cls, value: Any) -> PretrainedConfig:
    """Convert a dict into a config object; pass any other value through.

    For a dict, the config class is looked up from its ``model_type`` entry via
    ``AutoConfig.for_model``; without that key ``LlamaConfig`` is used. The dict
    is then expanded as keyword arguments to that class.
    """
    # Anything that is not a dict is returned untouched.
    if not isinstance(value, dict):
        return value

    if "model_type" in value:
        resolved_class: type[PretrainedConfig] = AutoConfig.for_model(
            model_type=value["model_type"]
        ).__class__
    else:
        resolved_class = LlamaConfig
    return resolved_class(**value)

EagleSpeculator

EagleSpeculator(
    config: EagleSpeculatorConfig,
    verifier: str
    | PathLike
    | PreTrainedModel
    | None = None,
    verifier_attachment_mode: Literal[
        "detached", "full", "train_only"
    ]
    | None = None,
)

Bases: SpeculatorModel

A SpeculatorModel implementation for EAGLE and HASS variants for spec decoding: - Eagle / Eagle v1: https://arxiv.org/abs/2401.15077 - Eagle v2: https://arxiv.org/abs/2406.16858 - HASS: https://arxiv.org/abs/2408.15766

Architecture Overview: The EAGLE speculator consists of: 1. Input embedding layer (shared with verifier) 2. Optional embedding layer normalization 3. Fusion layer: Concatenates and projects input embeddings + verifier hidden states to a latent space of hidden_size 4. Single transformer decoder layer for candidate token generation 5. Optional pre-LM head layer normalization 6. Language model head (shared with verifier)

Speculative Decoding Process: 1. Verifier model processes input and generates hidden states 2. EAGLE speculator uses these hidden states + input embeddings to predict next tokens 3. Multiple candidate tokens generated in parallel using token proposal methods 4. Verifier validates candidates and accepts/rejects based on probability thresholds 5. Process continues iteratively for multi-token speculation

Example:

from speculators import SpeculatorsConfig, VerifierConfig
from speculators.models import EagleSpeculator, EagleSpeculatorConfig
from speculators.proposals import GreedyTokenProposalConfig
from transformers import AutoConfig, AutoTokenizer

# The verifier may be given as a Hugging Face model identifier, a path to a
# model directory, or a PreTrainedModel instance.
verifier = "meta-llama/Llama-3.1-8B-Instruct"

config = EagleSpeculatorConfig(
    transformer_layer_config=AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct"),
    speculators_config=SpeculatorsConfig(
        algorithm="eagle",
        proposal_methods=[
            GreedyTokenProposalConfig(),
        ],
        default_proposal_method="greedy",
        verifier=VerifierConfig(
            name_or_path="meta-llama/Llama-3.1-8B-Instruct",
            architectures=["LlamaForCausalLM"],
        ),
    ),
)
speculator = EagleSpeculator(
    config, verifier=verifier, verifier_attachment_mode="full"
)

Initializes an EAGLE speculator architecture with configurable components based on the provided configuration. The model starts with verifier-dependent layers (embed_tokens, rotary_emb, lm_head) set to None until a verifier is attached.

Parameters:

  • config

    (EagleSpeculatorConfig) –

    Configuration object specifying model architecture, layer settings, and speculative decoding parameters. Must be an instance of EagleSpeculatorConfig containing transformer layer configuration and EAGLE-specific settings.

  • verifier

    (str | PathLike | PreTrainedModel | None, default: None ) –

    Optional verifier model to attach for speculative decoding. Can be a path to a model directory, Hugging Face model identifier, or PreTrainedModel instance. If None, must be attached later via attach_verifier() before using the model.

  • verifier_attachment_mode

    (Literal['detached', 'full', 'train_only'] | None, default: None ) –

    Mode for verifier attachment. "detached" prevents attachment even if verifier is provided. "full" enables complete integration for both training and generation. "train_only" attaches only components needed for training, optimizing memory usage.

Methods:

  • attach_verifier

    Attach a verifier model to the EagleSpeculator for speculative decoding.

  • detach_verifier

    Removes the reference to the attached verifier model and frees up the associated memory.

  • forward

    Execute the forward pass for speculative token generation.

  • from_training_args

    Create EAGLE model from training arguments.

  • get_trainer_kwargs

    Get training and validation kwargs for EAGLE.

Source code in speculators/models/eagle.py
def __init__(
    self,
    config: EagleSpeculatorConfig,
    verifier: str | os.PathLike | PreTrainedModel | None = None,
    verifier_attachment_mode: Literal["detached", "full", "train_only"]
    | None = None,
):
    """
    Build the EAGLE speculator's own layers from ``config``. The layers that come
    from a verifier (embed_tokens, rotary_emb, lm_head) start out as None and stay
    that way until a verifier is attached.

    :param config: Architecture and speculative decoding settings. Must be an
        EagleSpeculatorConfig instance.
    :param verifier: Optional verifier to attach: a path to a model directory, a
        Hugging Face model identifier, or a PreTrainedModel instance. If None, a
        verifier must be attached later via attach_verifier() before the model
        is used.
    :param verifier_attachment_mode: How the verifier is attached. "detached"
        prevents attachment even if a verifier is provided, "full" integrates it
        for both training and generation, and "train_only" attaches only the
        components needed for training.
    :raises ValueError: If ``config`` is not an EagleSpeculatorConfig.
    """
    if not isinstance(config, EagleSpeculatorConfig):
        raise ValueError(
            "config must be an instance of EagleSpeculatorConfig, "
            f"got {type(config)} instead."
        )

    # Plain values copied from the transformer layer config.
    layer_config = config.transformer_layer_config
    self.vocab_size = layer_config.vocab_size
    self.hidden_size = layer_config.hidden_size
    self.padding_idx = layer_config.pad_token_id

    # Placeholders for the layers pulled from the verifier; filled in by
    # attach_verifier.
    self.embed_tokens: nn.Embedding | None = None
    self.rotary_emb: nn.Module | None = None
    self.lm_head: nn.Linear | None = None

    # The parent initializer is deliberately delayed until here so that
    # everything attach_verifier needs is already set.
    super().__init__(
        config=config,
        verifier=verifier,
        verifier_attachment_mode=verifier_attachment_mode,
    )

    decoder_class, layernorm_class = self._import_model_classes()
    self._decoder_class = decoder_class
    self._layernorm_class = layernorm_class

    # Layers owned by the speculator itself, created from the configuration.
    self.embedding_layernorm: nn.Module | None = self._create_layernorm()
    # Fusion layer: 2 * hidden_size in, hidden_size out.
    self.fusion_fc: nn.Linear = nn.Linear(
        in_features=2 * self.hidden_size,
        out_features=self.hidden_size,
        bias=config.fusion_bias,
    )
    self.transformer: nn.Module = self._create_transformer_layer()
    self.pre_lm_head_layernorm: nn.Module | None = self._create_layernorm()

    self.post_init()  # type: ignore[attr-defined]

attach_verifier

attach_verifier(
    verifier: str | PathLike | PreTrainedModel,
    mode: Literal["full", "train_only"] | None = None,
)

Attach a verifier model to the EagleSpeculator for speculative decoding. Utilizes the verifier's embed_tokens, rotary_emb, and lm_head layers for the speculator's forward pass and generation methods. Additionally, for generate, it uses the verifier's hidden states to generate speculative token predictions.

If mode is "full", the verifier is fully integrated for use with both generate and forward methods.

If mode is "train_only", only the verifier's layers required for a forward pass are attached, allowing for better resource utilization during training. generate will not be available until a full verifier is attached.

Example:

# Load and attach a verifier
# (a path or Hugging Face identifier is loaded automatically; a
# PreTrainedModel instance can be passed instead)
verifier = "meta-llama/Llama-3.1-8B-Instruct"

# For generation
speculator.attach_verifier(verifier)
outputs = speculator.generate(input_ids)
speculator.detach_verifier()

# For training
speculator.attach_verifier(verifier, mode="train_only")
outputs = speculator(input_ids, hidden_states)
speculator.detach_verifier()

Parameters:

  • verifier

    (str | PathLike | PreTrainedModel) –

    The verifier model to attach. This can be a path to a local model directory, a Hugging Face model identifier, or an instance of PreTrainedModel. If a path or identifier is provided, the model will be loaded automatically. If an instance is provided, it will be used directly.

  • mode

    (Literal['full', 'train_only'] | None, default: None ) –

    The mode for attaching the verifier. Can be "full" or "train_only". If None, defaults to "full". In "train_only" mode, only the layers required for a forward pass are attached, and the speculator cannot perform generation until a full verifier is attached.

Returns:

  • None. The source shown below does not return a value; the verifier's embed_tokens, rotary_emb, and lm_head layers are bound onto the speculator in place.

Source code in speculators/models/eagle.py
def attach_verifier(
    self,
    verifier: str | os.PathLike | PreTrainedModel,
    mode: Literal["full", "train_only"] | None = None,
):
    """
    Attach a verifier and borrow its embed_tokens, rotary_emb, and lm_head layers
    for this speculator.

    The parent class handles the attachment itself. Afterwards the model to
    borrow layers from depends on ``self.verifier_attachment_mode``:

    - "full": the layers are taken from ``self.verifier``.
    - "train_only": the layers are taken from the model returned by
      ``self.resolve_verifier(verifier)``.
    - anything else: no layers are bound.

    Example:
        ```python
        # A path or Hugging Face identifier, or a loaded PreTrainedModel
        verifier = "meta-llama/Llama-3.1-8B-Instruct"

        # For generation
        speculator.attach_verifier(verifier)
        outputs = speculator.generate(input_ids)
        speculator.detach_verifier()

        # For training
        speculator.attach_verifier(verifier, mode="train_only")
        outputs = speculator(input_ids, hidden_states)
        speculator.detach_verifier()
        ```

    :param verifier: The verifier model to attach: a path to a local model
        directory, a Hugging Face model identifier, or a PreTrainedModel
        instance.
    :param mode: "full" or "train_only". It is forwarded unchanged to the parent
        implementation.
    :return: None. The borrowed layers are set on this speculator in place.
    """
    super().attach_verifier(verifier=verifier, mode=mode)

    attachment_mode = self.verifier_attachment_mode
    if attachment_mode == "train_only":
        layer_source = self.resolve_verifier(verifier)
    elif attachment_mode == "full":
        layer_source = cast("PreTrainedModel", self.verifier)
    else:
        # Any other mode: there is nothing to borrow layers from.
        return

    # The embedding and rotary layers sit under `.model` when the verifier has
    # that attribute, otherwise directly on the verifier (bare model structure).
    if hasattr(layer_source, "model"):
        self.embed_tokens = layer_source.model.embed_tokens  # type: ignore[assignment,union-attr]
        self.rotary_emb = layer_source.model.rotary_emb  # type: ignore[assignment,union-attr]
    else:
        self.embed_tokens = layer_source.embed_tokens  # type: ignore[assignment,attr-defined]
        self.rotary_emb = layer_source.rotary_emb  # type: ignore[assignment,attr-defined]

    # lm_head is always read from the top level of the verifier.
    self.lm_head = layer_source.lm_head  # type: ignore[assignment,attr-defined]

detach_verifier

detach_verifier()

Removes the reference to the attached verifier model and frees up the associated memory. After calling this method, the speculator will not be able to perform forward passes or generation until a new verifier is attached.

Source code in speculators/models/eagle.py
def detach_verifier(self):
    """
    Detach the verifier and drop the layers that were borrowed from it.

    The parent class detaches the verifier first. Then embed_tokens, rotary_emb,
    and lm_head are each deleted and reset to None, so forward() raises until a
    verifier is attached again.
    """
    super().detach_verifier()

    # Delete each attribute before resetting it, in the same order as before.
    for borrowed_layer in ("embed_tokens", "rotary_emb", "lm_head"):
        delattr(self, borrowed_layer)
        setattr(self, borrowed_layer, None)

forward

forward(
    input_ids: LongTensor,
    hidden_states: FloatTensor,
    attention_mask: Tensor | None = None,
    position_ids: LongTensor | None = None,
    past_key_values: tuple[tuple[FloatTensor]]
    | None = None,
    use_cache: bool | None = None,
    output_attentions: bool | None = None,
    output_hidden_states: bool | None = None,
    return_dict: bool | None = None,
) -> torch.FloatTensor | CausalLMOutputWithPast

Execute the forward pass for speculative token generation.

Processes input tokens and verifier hidden states through the EAGLE architecture to generate candidate tokens for speculative decoding. The method combines input embeddings with verifier hidden states via a fusion layer, processes them through a transformer decoder layer, and produces logits for next token prediction.

Parameters:

  • input_ids

    (LongTensor) –

    Token IDs for the current input sequence. Shape: (batch_size, sequence_length). These represent the tokens that will be converted to embeddings and combined with verifier hidden states.

  • hidden_states

    (FloatTensor) –

    Hidden state representations from the verifier model corresponding to the input sequence. Shape: (batch_size, sequence_length, hidden_size). These capture the verifier's understanding of the context.

  • attention_mask

    (Tensor | None, default: None ) –

    Optional attention mask to avoid attending to padding tokens. Shape: (batch_size, sequence_length) for 2D or (batch_size, 1, sequence_length, sequence_length) for 4D causal mask.

  • position_ids

    (LongTensor | None, default: None ) –

    Optional position indices for tokens in the sequence. Shape: (batch_size, sequence_length). If None, auto-generated based on sequence length and past key values.

  • past_key_values

    (tuple[tuple[FloatTensor]] | None, default: None ) –

    Optional cached key-value states from previous forward passes for efficient generation. Tuple of layer key-value pairs.

  • use_cache

    (bool | None, default: None ) –

    Whether to return key-value states for caching in subsequent forward passes. Useful for autoregressive generation efficiency.

  • output_attentions

    (bool | None, default: None ) –

    Whether to return attention weights from the transformer layer. Used for analysis and visualization.

  • output_hidden_states

    (bool | None, default: None ) –

    Whether to return hidden states from the transformer layer. Currently not implemented in this model.

  • return_dict

    (bool | None, default: None ) –

    Whether to return structured CausalLMOutputWithPast instead of raw logits. If None, uses config.use_return_dict default.

Returns:

  • FloatTensor | CausalLMOutputWithPast

    Either raw logits tensor (batch_size, sequence_length, vocab_size) if return_dict=False, or CausalLMOutputWithPast containing logits, past key values, and optional attention weights.

Raises:

  • ValueError

    If verifier components (embed_tokens, rotary_emb, lm_head) are not attached. Call attach_verifier() before using forward().

Source code in speculators/models/eagle.py
def forward(
    self,
    input_ids: torch.LongTensor,
    hidden_states: torch.FloatTensor,
    attention_mask: torch.Tensor | None = None,
    position_ids: torch.LongTensor | None = None,
    past_key_values: tuple[tuple[torch.FloatTensor]] | None = None,
    use_cache: bool | None = None,
    output_attentions: bool | None = None,
    output_hidden_states: bool | None = None,  # noqa: ARG002
    return_dict: bool | None = None,
) -> torch.FloatTensor | CausalLMOutputWithPast:
    """
    Run the speculator forward pass and return next-token logits.

    The input tokens are embedded (and optionally layer-normalized), concatenated
    with the verifier hidden states along the last dimension, and projected by the
    fusion layer. The result passes through the transformer layer, an optional
    pre-LM-head layernorm, and the LM head.

    :param input_ids: Token IDs for the current input sequence; fed to
        embed_tokens.
    :param hidden_states: Verifier hidden states for the same sequence. They are
        concatenated with the token embeddings on the last dimension.
    :param attention_mask: Optional attention mask; passed through
        _prepare_decoder_inputs before reaching the transformer layer.
    :param position_ids: Optional position indices; passed through
        _prepare_decoder_inputs as well.
    :param past_key_values: Optional cached key-value states. Only the first
        entry is handed to the transformer layer.
    :param use_cache: Forwarded to the transformer layer. When truthy, the
        structured output carries ``layer_outputs[1]`` as past_key_values.
    :param output_attentions: Forwarded to the transformer layer.
    :param output_hidden_states: Accepted but unused by this method.
    :param return_dict: Whether to return a CausalLMOutputWithPast instead of the
        raw logits. If None, config.use_return_dict decides.
    :return: The logits tensor when return_dict is falsy; otherwise a
        CausalLMOutputWithPast with logits and (if use_cache) past key values.
        Its hidden_states and attentions fields are always None.
    :raises ValueError: If embed_tokens, rotary_emb, or lm_head is None, i.e. no
        verifier has been attached.
    """
    if self.embed_tokens is None or self.rotary_emb is None or self.lm_head is None:
        raise ValueError(
            "Verifier model layers not initialized. "
            "Call `attach_verifier` to set up the model before using forward."
        )

    if return_dict is None:
        return_dict = self.config.use_return_dict

    token_embeddings = self.embed_tokens(input_ids)
    if self.embedding_layernorm is not None:
        token_embeddings = self.embedding_layernorm(token_embeddings)

    # Fuse token embeddings with the verifier hidden states.
    fused_input = torch.cat([token_embeddings, hidden_states], dim=-1)
    decoder_states = self.fusion_fc(fused_input)
    decoder_states, attention_mask, position_ids = self._prepare_decoder_inputs(
        decoder_states, attention_mask, position_ids, past_key_values
    )

    rope_cos, rope_sin = self.rotary_emb(decoder_states, position_ids)
    layer_cache = past_key_values[0] if past_key_values else None
    layer_outputs = self.transformer(
        decoder_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=layer_cache,
        output_attentions=output_attentions,
        use_cache=use_cache,
        position_embeddings=(rope_cos, rope_sin),
    )
    decoder_states = layer_outputs[0]

    if self.pre_lm_head_layernorm is not None:
        decoder_states = self.pre_lm_head_layernorm(decoder_states)

    logits = self.lm_head(decoder_states)

    if return_dict:
        return CausalLMOutputWithPast(
            logits=logits,
            past_key_values=layer_outputs[1] if use_cache else None,
            hidden_states=None,
            attentions=None,
        )
    return logits

from_training_args classmethod

from_training_args(
    verifier_config: PretrainedConfig, **kwargs
) -> EagleSpeculator

Create EAGLE model from training arguments.

Args: verifier_config: Verifier model configuration **kwargs: Training arguments with EAGLE-specific params - layernorms: Whether to include layer normalization layers - fusion_bias: Whether to add bias to fusion layer - transformer_layer_architecture: Name of transformer decoder layer class - verifier_name_or_path: Path to verifier model

Returns: Initialized EagleSpeculator

Source code in speculators/models/eagle.py
@classmethod
def from_training_args(
    cls,
    verifier_config: PretrainedConfig,
    **kwargs,
) -> "EagleSpeculator":
    """Build an EAGLE speculator from a verifier config and training arguments.

    Args:
        verifier_config: Verifier model configuration. It is also passed as the
            speculator's ``transformer_layer_config``.
        **kwargs: Training arguments with EAGLE-specific params
            - layernorms: Whether to include layer normalization layers
                (defaults to False)
            - fusion_bias: Whether to add bias to fusion layer
                (defaults to False)
            - transformer_layer_architecture: Name of transformer decoder layer
                class (defaults to "auto")
            - verifier_name_or_path: Path to verifier model (required)

    Returns:
        Initialized EagleSpeculator

    Raises:
        KeyError: If ``verifier_name_or_path`` is missing from ``kwargs``.
    """
    use_layernorms = kwargs.get("layernorms", False)
    use_fusion_bias = kwargs.get("fusion_bias", False)
    layer_architecture = kwargs.get("transformer_layer_architecture", "auto")

    greedy_proposal = GreedyTokenProposalConfig()
    verifier_description = VerifierConfig.from_config(
        verifier_config, name_or_path=kwargs["verifier_name_or_path"]
    )
    speculators_config = SpeculatorsConfig(
        algorithm="eagle",
        proposal_methods=[greedy_proposal],
        default_proposal_method="greedy",
        verifier=verifier_description,
    )

    model_config = EagleSpeculatorConfig(
        transformer_layer_config=verifier_config,
        layernorms=use_layernorms,
        fusion_bias=use_fusion_bias,
        transformer_layer_architecture=layer_architecture,
        speculators_config=speculators_config,
    )

    # No verifier model is passed to the constructor here.
    return cls(config=model_config)

get_trainer_kwargs staticmethod

get_trainer_kwargs(**kwargs) -> tuple[dict, dict]

Get training and validation kwargs for EAGLE.

EAGLE doesn't require any special forward pass arguments during training, so this returns empty dictionaries.

Args: **kwargs: Training arguments (unused)

Returns: Tuple of (train_call_kwargs, val_call_kwargs), both empty dicts

Source code in speculators/models/eagle.py
@staticmethod
def get_trainer_kwargs(**kwargs) -> tuple[dict, dict]:  # noqa: ARG004
    """Return the extra call kwargs for training and validation with EAGLE.

    No training argument is consulted; both returned dictionaries are empty.

    Args:
        **kwargs: Training arguments (unused)

    Returns:
        Tuple of (train_call_kwargs, val_call_kwargs), both empty dicts
    """
    train_call_kwargs: dict = {}
    val_call_kwargs: dict = {}
    return train_call_kwargs, val_call_kwargs

EagleSpeculatorConfig

EagleSpeculatorConfig(**kwargs)

Bases: SpeculatorModelConfig

A SpeculatorModelConfig implementation to be used with the EagleSpeculator for EAGLE and HASS variants for spec decoding: - Eagle / Eagle v1: https://arxiv.org/abs/2401.15077 - Eagle v2: https://arxiv.org/abs/2406.16858 - HASS: https://arxiv.org/abs/2408.15766

Model Configurations: - EAGLE1: layernorms=False, fusion_bias=False - EAGLE2: layernorms=False, fusion_bias=False - HASS: layernorms=False, fusion_bias=True

Example:

from speculators import SpeculatorsConfig, VerifierConfig
from speculators.models import EagleSpeculatorConfig
from speculators.proposals import GreedyTokenProposalConfig
from transformers import AutoConfig

config = EagleSpeculatorConfig(
    transformer_layer_config=AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct"),
    speculators_config=SpeculatorsConfig(
        algorithm="eagle",
        proposal_methods=[
            GreedyTokenProposalConfig(),
        ],
        default_proposal_method="greedy",
        verifier=VerifierConfig(
            name_or_path="meta-llama/Llama-3.1-8B-Instruct",
            architectures=["LlamaForCausalLM"],
        ),
    ),
)

Methods:

Source code in speculators/config.py
def __init__(self, **kwargs):
    """
    Initialize the config through both of its bases: Pydantic first, then
    the Hugging Face PretrainedConfig.

    :param kwargs: Keyword arguments forwarded to both initializers
    """
    # Pydantic goes first so that every declared field is set on the instance
    PydanticClassRegistryMixin.__init__(self, **kwargs)

    # Replace the Pydantic-handled entries with the values now stored on the
    # instance, so PretrainedConfig does not override them
    kwargs.update(
        {name: getattr(self, name) for name in self.__class__.model_fields}
    )

    # Hugging Face PretrainedConfig initialization for the model
    PretrainedConfig.__init__(self, **kwargs)

    # always record the transformers version currently installed
    self.transformers_version = version("transformers")

check_add_architectures

check_add_architectures() -> Self

Automatically adds the transformer layer architecture to the architectures list if it's not already present.

Returns:

  • Self

    The validated configuration instance with updated architectures

Source code in speculators/models/eagle.py
@model_validator(mode="after")
def check_add_architectures(self) -> Self:
    """
    Ensure the transformer layer architecture is listed in ``architectures``.

    Nothing is added when the architecture is set to "auto" or is already
    present in the list.

    :return: The validated configuration instance with updated architectures
    """
    layer_arch = self.transformer_layer_architecture

    # Guard clause: nothing to add for "auto" or an already-listed entry
    if layer_arch == "auto" or layer_arch in self.architectures:
        return self

    self.architectures.append(layer_arch)
    return self

serialize_transformer_layer_config

serialize_transformer_layer_config(
    value: PretrainedConfig,
) -> dict

Serialize the transformer_layer_config to a dictionary for JSON storage.

Converts the PretrainedConfig object to its dictionary representation using to_diff_dict() to only include non-default values.

Parameters:

  • value

    (PretrainedConfig) –

    The PretrainedConfig instance to serialize

Returns:

  • dict

    Dictionary representation of the transformer layer configuration

Source code in speculators/models/eagle.py
@field_serializer("transformer_layer_config")
def serialize_transformer_layer_config(self, value: PretrainedConfig) -> dict:
    """
    Turn the transformer_layer_config into a plain dictionary for JSON storage.

    Delegates to the config's own to_diff_dict(), so only non-default values
    are included in the result.

    :param value: The PretrainedConfig instance to serialize
    :return: Dictionary representation of the transformer layer configuration
    """
    diff_dict = value.to_diff_dict()
    return diff_dict

validate_transformer_layer_config classmethod

validate_transformer_layer_config(
    value: Any,
) -> PretrainedConfig

Validate and convert transformer_layer_config to a PretrainedConfig instance.

Accepts either a dictionary that can be converted to a PretrainedConfig or an existing PretrainedConfig instance.

Parameters:

  • value

    (Any) –

    The value to validate (dict or PretrainedConfig)

Returns:

  • PretrainedConfig

    A validated PretrainedConfig instance

Raises:

  • ValueError

    If the value cannot be converted to a PretrainedConfig

Source code in speculators/models/eagle.py
@field_validator("transformer_layer_config", mode="before")
@classmethod
def validate_transformer_layer_config(cls, value: Any) -> PretrainedConfig:
    """
    Validate and convert transformer_layer_config to a PretrainedConfig instance.

    Accepts either a dictionary that can be converted to a PretrainedConfig
    (it must carry a "model_type" entry so the config class can be resolved)
    or an existing PretrainedConfig instance, which is returned unchanged.

    :param value: The value to validate (dict or PretrainedConfig)
    :return: A validated PretrainedConfig instance
    :raises ValueError: If the value is neither a dict nor a PretrainedConfig,
        or if a dict is given without a "model_type" key
    """
    if isinstance(value, dict):
        # AutoConfig.for_model requires model_type; without this check a dict
        # lacking it fails with a bare TypeError about a missing argument
        # rather than the ValueError this validator documents.
        if "model_type" not in value:
            raise ValueError(
                "transformer_layer_config dictionary must include a "
                "'model_type' key so it can be converted to a PretrainedConfig."
            )
        return AutoConfig.for_model(**value)
    if isinstance(value, PretrainedConfig):
        return value

    raise ValueError(
        "transformer_layer_config must be a PretrainedConfig instance or a "
        "dictionary that can be converted to a PretrainedConfig."
    )

MLPSpeculatorConfig

MLPSpeculatorConfig(**kwargs)

Bases: SpeculatorModelConfig

TODO

Source code in speculators/config.py
def __init__(self, **kwargs):
    """
    Set up the config by running the Pydantic initializer followed by the
    Hugging Face PretrainedConfig initializer.

    :param kwargs: Keyword arguments passed on to both initializers
    """
    # run the Pydantic side first so all valid fields are set
    PydanticClassRegistryMixin.__init__(self, **kwargs)

    # copy each Pydantic field's current value back into kwargs so that
    # PretrainedConfig doesn't override it
    field_names = self.__class__.model_fields
    for field_name in field_names:
        kwargs[field_name] = getattr(self, field_name)

    # then run the Hugging Face PretrainedConfig side for the model
    PretrainedConfig.__init__(self, **kwargs)

    # keep the recorded transformers version up to date
    self.transformers_version = version("transformers")