llmcompressor.utils.helpers

General utility helper functions. Common functions for interfacing with Python primitives and directories/files.

Functions:

DisableQuantization –

Disable quantization during forward passes after applying a quantization config
calibration_forward_context –

Context in which all calibration forward passes should occur.
disable_cache –

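Temporarily disable the key-value cache for transformer models. Used to prevent excess memory use in one-shot cases where the model only performs the prefill phase and not the generation phase.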
disable_hf_kernels –

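In transformers>=4.50.0, some module forward methods may be replaced by calls to hf hub kernels. This has the potential to bypass hooks added by LLM Compressor.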
disable_lm_head –

Disable the lm_head of a model by moving it to the meta device. This function does not untie parameters and restores the model to its properly loaded state upon exit.
eval_context –

Disable pytorch training mode for the given module
import_from_path –

Import the module and the name of the function/class separated by :

DisableQuantization

DisableQuantization(module: Module)

Disable quantization during forward passes after applying a quantization config

Source code in src/llmcompressor/utils/helpers.py

@contextlib.contextmanager
def DisableQuantization(module: torch.nn.Module):
    """
    Context manager which turns quantization off for the duration of the block

    On entry, `disable_quantization` is applied to `module` via `module.apply`.
    On exit (normal or exceptional), `enable_quantization` is applied the same
    way. Note that quantization is re-enabled unconditionally on exit.

    :param module: module to which the disable/enable functions are applied
    """
    with contextlib.ExitStack() as cleanup:
        # register the re-enable step first so it runs even if disabling
        # fails part way through the module tree
        cleanup.callback(module.apply, enable_quantization)
        module.apply(disable_quantization)
        yield

calibration_forward_context

calibration_forward_context(model: Module)

Context in which all calibration forward passes should occur.

Remove gradient calculations
Disable the KV cache
Disable train mode and enable eval mode
Disable hf kernels which could bypass hooks
Disable lm head (input and weights can still be calibrated, output will be meta)

Source code in src/llmcompressor/utils/helpers.py

@contextlib.contextmanager
def calibration_forward_context(model: torch.nn.Module):
    """
    Context under which every calibration forward pass is expected to run.

    Entered in this order (and exited in reverse):

    - `torch.no_grad()`: remove gradient calculations
    - `disable_cache`: disable the KV cache
    - `eval_context`: disable train mode and enable eval mode
    - `disable_hf_kernels`: disable hf kernels which could bypass hooks
    - `disable_lm_head`: disable lm head (input and weights can still be
      calibrated, output will be meta)

    :param model: model whose forward passes are being calibrated
    """
    # each factory takes the model and returns a context manager
    model_contexts = (
        disable_cache,
        eval_context,
        disable_hf_kernels,
        disable_lm_head,
    )

    with contextlib.ExitStack() as stack:
        stack.enter_context(torch.no_grad())
        for make_context in model_contexts:
            stack.enter_context(make_context(model))
        yield

disable_cache

disable_cache(module: Module)

Temporarily disable the key-value cache for transformer models. Used to prevent excess memory use in one-shot cases where the model only performs the prefill phase and not the generation phase.

Example:

>>> model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> input = torch.randint(0, 32, size=(1, 32))
>>> with disable_cache(model):
...     output = model(input)

Source code in src/llmcompressor/utils/helpers.py

@contextlib.contextmanager
def disable_cache(module: torch.nn.Module):
    """
    Temporarily set `use_cache` to False on a transformers model config.

    Intended to avoid excess memory use in one-shot cases, where the model only
    performs the prefill phase and never the generation phase. Modules which are
    not instances of `PreTrainedModel` are left untouched.

    Example:
    >>> model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    >>> input = torch.randint(0, 32, size=(1, 32))
    >>> with disable_cache(model):
    ...     output = model(input)

    :param module: module whose config (if any) should have caching disabled
    """
    if not isinstance(module, PreTrainedModel):
        # no transformers config to patch
        yield
        return

    # prefer the nested `text_config` when the config has one, otherwise
    # patch the top-level config itself
    top_level_config = module.config
    target_config = getattr(top_level_config, "text_config", top_level_config)

    with patch_attr(target_config, "use_cache", False):
        yield

disable_hf_kernels

disable_hf_kernels(module: Module)

In transformers>=4.50.0, some module forward methods may be replaced by calls to hf hub kernels. This has the potential to bypass hooks added by LLM Compressor

Source code in src/llmcompressor/utils/helpers.py

@contextlib.contextmanager
def disable_hf_kernels(module: torch.nn.Module):
    """
    Temporarily set `disable_custom_kernels` to True on a transformers config.

    In transformers>=4.50.0, some module forward methods may be replaced by
    calls to hf hub kernels, which has the potential to bypass hooks added by
    LLM Compressor. Modules which are not instances of `PreTrainedModel` are
    left untouched.

    :param module: module whose config (if any) should be patched
    """
    with contextlib.ExitStack() as stack:
        # only transformers models are patched; the stack stays empty otherwise
        if isinstance(module, PreTrainedModel):
            kernel_patch = patch_attr(module.config, "disable_custom_kernels", True)
            stack.enter_context(kernel_patch)

        yield

disable_lm_head

disable_lm_head(model: Module)

Disable the lm_head of a model by moving it to the meta device. This function does not untie parameters and restores the model to its properly loaded state upon exit.

Source code in src/llmcompressor/utils/helpers.py

@contextlib.contextmanager
def disable_lm_head(model: torch.nn.Module):
    """
    Disable the lm_head of a model by making its forward pass run on the meta
    device. The lm_head module itself is not moved: its `forward` attribute is
    temporarily patched with a function that multiplies a meta copy of the input
    by a meta copy of the weight, so the output is a meta tensor.

    If no lm_head is found, or the lm_head is not a `torch.nn.Linear`, a warning
    is logged and the model is left unmodified.

    :param model: model whose lm_head should be disabled within the context
    """
    _, lm_head = get_embeddings(model)

    if lm_head is None:
        logger.warning(
            f"Attempted to disable lm_head of instance {model.__class__.__name__}, "
            "but was unable to find lm_head. This may lead to unexpected OOM."
        )
        yield
        return

    if not isinstance(lm_head, torch.nn.Linear):
        logger.warning(f"Cannot disable LM head of type {lm_head.__class__.__name__}")
        yield
        return

    # meta copy of the weight; the original parameter is not touched here
    dummy_weight = lm_head.weight.to("meta")

    def dummy_forward(self, input: torch.Tensor) -> torch.Tensor:
        # only the matmul with the weight is reproduced, on the meta device
        return input.to("meta") @ dummy_weight.T

    with contextlib.ExitStack() as stack:
        # bind as a method so that it is called like the original forward
        lm_head_forward = dummy_forward.__get__(lm_head)
        stack.enter_context(patch_attr(lm_head, "forward", lm_head_forward))

        # NOTE(review): presumably prevents the hook from trying to move the
        # meta output back onto the input device; confirm against accelerate
        if hasattr(model, "_hf_hook"):
            stack.enter_context(patch_attr(model._hf_hook, "io_same_device", False))

        yield

eval_context

eval_context(module: Module)

Disable pytorch training mode for the given module

Source code in src/llmcompressor/utils/helpers.py

@contextlib.contextmanager
def eval_context(module: torch.nn.Module):
    """
    Put the given module into eval mode for the duration of the context

    The value of `module.training` read on entry is passed back to
    `module.train` on exit, whether the block finishes normally or raises.

    :param module: module to switch out of training mode
    """
    was_training = module.training

    with contextlib.ExitStack() as cleanup:
        # register the restore step before switching modes
        cleanup.callback(module.train, was_training)
        module.train(False)  # same as calling eval()
        yield

import_from_path

import_from_path(path: str) -> str

Import the object specified by a module path and the name of a function/class, separated by `:`.

Examples:

path = "/path/to/file.py:func_or_class_name"
path = "/path/to/file:focn"
path = "path.to.file:focn"

Parameters:

path (str) –

path including the file path and object name

Source code in src/llmcompressor/utils/helpers.py

def import_from_path(path: str) -> object:
    """
    Import a module and return the function/class named after the ``:``
    Examples:
      path = "/path/to/file.py:func_or_class_name"
      path = "/path/to/file:focn"
      path = "path.to.file:focn"

    The part before the ``:`` is turned into a dotted module name by removing a
    trailing ``.py`` and replacing each run of ``/`` with ``.``; the result is
    passed to `importlib.import_module`.

    :param path: path including the file path and object name
    :return: function or class object found on the imported module
    :raises ValueError: if `path` does not contain exactly one ``:``
    :raises ImportError: if the module cannot be imported
    :raises AttributeError: if the module has no attribute with the given name
    """
    if path.count(":") != 1:
        raise ValueError(
            f"Expected a path of the form '<module_or_file>:<name>', got {path!r}"
        )
    original_path, class_name = path.split(":")

    # Remove only a trailing ".py". Cutting at the first ".py" found anywhere
    # would truncate dotted names such as "pkg.pymod" down to "pkg".
    module_path = original_path
    if module_path.endswith(".py"):
        module_path = module_path[: -len(".py")]

    # NOTE(review): a leading "/" becomes a leading ".", which import_module
    # treats as a relative import; confirm whether absolute file paths are
    # meant to be supported
    module_path = re.sub(r"/+", ".", module_path)

    try:
        module = importlib.import_module(module_path)
    except ImportError as err:
        # keep the underlying failure attached for debugging
        raise ImportError(f"Cannot find module with path {original_path}") from err

    try:
        return getattr(module, class_name)
    except AttributeError as err:
        raise AttributeError(f"Cannot find {class_name} in {original_path}") from err