Skip to content

vllm.model_executor.models.transformers

Wrapper around transformers models

logger module-attribute

logger = init_logger(__name__)

ConfigOverride

Context manager to temporarily override config attributes.

Source code in vllm/model_executor/models/transformers.py
class ConfigOverride:
    """Context manager to temporarily override config attributes."""

    def __init__(self, config: PretrainedConfig, **kwargs):
        self.config = config
        self.kwargs = kwargs
        self.kwargs_original = {}
        self.kwargs_delete = set()

    def __enter__(self):
        """Override config attributes."""
        for key, value in self.kwargs.items():
            if not hasattr(self.config, key):
                self.kwargs_delete.add(key)
            self.kwargs_original[key] = getattr(self.config, key, None)
            setattr(self.config, key, value)
        return self.config

    def __exit__(self, exc_type, exc_value, traceback):
        """Restore original config attributes."""
        for key, value in self.kwargs_original.items():
            if key in self.kwargs_delete:
                delattr(self.config, key)
            else:
                setattr(self.config, key, value)

config instance-attribute

config = config

kwargs instance-attribute

kwargs = kwargs

kwargs_delete instance-attribute

kwargs_delete = set()

kwargs_original instance-attribute

kwargs_original = {}

__enter__

__enter__()

Override config attributes.

Source code in vllm/model_executor/models/transformers.py
def __enter__(self):
    """Override config attributes."""
    for key, value in self.kwargs.items():
        if not hasattr(self.config, key):
            self.kwargs_delete.add(key)
        self.kwargs_original[key] = getattr(self.config, key, None)
        setattr(self.config, key, value)
    return self.config

__exit__

__exit__(exc_type, exc_value, traceback)

Restore original config attributes.

Source code in vllm/model_executor/models/transformers.py
def __exit__(self, exc_type, exc_value, traceback):
    """Restore original config attributes."""
    for key, value in self.kwargs_original.items():
        if key in self.kwargs_delete:
            delattr(self.config, key)
        else:
            setattr(self.config, key, value)

__init__

__init__(config: PretrainedConfig, **kwargs)
Source code in vllm/model_executor/models/transformers.py
def __init__(self, config: PretrainedConfig, **kwargs):
    self.config = config
    self.kwargs = kwargs
    self.kwargs_original = {}
    self.kwargs_delete = set()

MultiModalDummyInputsBuilder

Bases: BaseDummyInputsBuilder[MultiModalProcessingInfo]

Source code in vllm/model_executor/models/transformers.py
class MultiModalDummyInputsBuilder(
        BaseDummyInputsBuilder[MultiModalProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

        processor = self.info.get_hf_processor()
        if "gemma3" in processor.__class__.__name__.lower():
            image_token = processor.boi_token
        else:
            image_token = getattr(processor, "image_token", "")
        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = self.info.get_max_image_size()

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images),
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict
Source code in vllm/model_executor/models/transformers.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    target_width, target_height = self.info.get_max_image_size()

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images),
    }

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/transformers.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)

    processor = self.info.get_hf_processor()
    if "gemma3" in processor.__class__.__name__.lower():
        image_token = processor.boi_token
    else:
        image_token = getattr(processor, "image_token", "")
    return image_token * num_images

MultiModalProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/transformers.py
class MultiModalProcessingInfo(BaseProcessingInfo):

    def get_hf_config(self):
        return self.ctx.model_config.hf_config

    def get_supported_mm_limits(self):
        return {"image": None}

    def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
        return {"image": self.get_max_image_tokens()}

    def get_max_image_tokens(self) -> int:
        width, height = self.get_max_image_size()
        processor = self.get_hf_processor()
        mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
        mm_tokens = processor._get_num_multimodal_tokens(
            image_sizes=([height, width], ), **mm_processor_kwargs)
        image_tokens = mm_tokens["num_image_tokens"][0]
        return image_tokens

    def get_hf_processor(self):
        processor = cached_get_processor(self.ctx.model_config.model)
        return processor

    def get_max_image_size(self):
        return 10_000, 10_000  # hardcode for arbitrary very large size

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/transformers.py
def get_hf_config(self):
    return self.ctx.model_config.hf_config

get_hf_processor

get_hf_processor()
Source code in vllm/model_executor/models/transformers.py
def get_hf_processor(self):
    processor = cached_get_processor(self.ctx.model_config.model)
    return processor

get_max_image_size

get_max_image_size()
Source code in vllm/model_executor/models/transformers.py
def get_max_image_size(self):
    return 10_000, 10_000  # hardcode for arbitrary very large size

get_max_image_tokens

get_max_image_tokens() -> int
Source code in vllm/model_executor/models/transformers.py
def get_max_image_tokens(self) -> int:
    width, height = self.get_max_image_size()
    processor = self.get_hf_processor()
    mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
    mm_tokens = processor._get_num_multimodal_tokens(
        image_sizes=([height, width], ), **mm_processor_kwargs)
    image_tokens = mm_tokens["num_image_tokens"][0]
    return image_tokens

get_mm_max_tokens_per_item

get_mm_max_tokens_per_item(seq_len, mm_counts)
Source code in vllm/model_executor/models/transformers.py
def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
    return {"image": self.get_max_image_tokens()}

get_supported_mm_limits

get_supported_mm_limits()
Source code in vllm/model_executor/models/transformers.py
def get_supported_mm_limits(self):
    return {"image": None}

MultiModalProcessor

Bases: BaseMultiModalProcessor[MultiModalProcessingInfo]

Source code in vllm/model_executor/models/transformers.py
class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ):
        """
        Given the original multi-modal items for this modality
        and HF-processed data, output the updates to perform.

        The information returned by this method is used to update token inputs
        which bypass the HF processor. It is also used to update the output of
        HF processor if the HF process does not apply prompt updates to text
        inputs.

        Moreover, this information is critical to determine the token positions
        in order to construct  :class:`~vllm-multimodal.input.PlaceholderRange`
        for each multi-modal item.
        """
        return None

    def _get_mm_fields_config(
        self,
        hf_inputs,
        hf_processor_mm_kwargs,
        num_image_patches: torch.Tensor = None,
    ):
        # HF Processors always return a mask but vLLM doesn't need it
        hf_inputs.pop("attention_mask", None)
        mm_fields = {
            key: MultiModalFieldConfig.flat_from_sizes("image",
                                                       num_image_patches)
            for key in hf_inputs
        }
        mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
            "image", num_image_patches)
        mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")
        return mm_fields

    def _apply_hf_processor_text_mm(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ):
        """
        Apply the HF processor on the prompt text and multi-modal data
        together.

        In addition, return whether prompt replacements have been applied.
        """
        processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
        processor_data["return_mm_token_type_ids"] = True

        processed_data = self._call_hf_processor(
            prompt=prompt_text,
            mm_data=processor_data,
            mm_kwargs=hf_processor_mm_kwargs,
            tok_kwargs=tokenization_kwargs,
        )
        processed_data.update(passthrough_data)

        prompt_ids, = processed_data.pop("input_ids").tolist()
        mm_token_type_ids = processed_data.pop(
            "mm_token_type_ids"
        ) if "mm_token_type_ids" in processed_data else processed_data.pop(
            "token_type_ids")  # for gemma3 only

        return prompt_ids, processed_data, mm_token_type_ids

    def apply(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Optional[Mapping[str, object]] = None,
        return_mm_hashes: bool = False,
    ) -> MultiModalInputs:
        """
        Process multi-modal inputs to be used in vLLM.

        Apply HF Processor on prompt text and multi-modal data together,
        outputting token IDs and processed tensors.
        """
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

        mm_items = self._to_mm_items(mm_data)
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        if not isinstance(prompt, str):
            # the prompt is the tokenized ids which is not supported
            # by the hf_processor, which is why we would need to decode the ids
            # into string
            prompt = hf_processor.decode(prompt)

        (prompt_ids, processed_data,
         mm_token_type_ids) = self._apply_hf_processor_text_mm(
             prompt_text=prompt,
             mm_items=mm_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
         )

        # HF processor will return `mm_token_type_ids` from which
        # we can infer mm_placeholders. Until then hardcode to make code run
        # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
        mm_positions = torch.where(mm_token_type_ids == 1)[1]
        images = mm_items.get_items("image", ImageProcessorItems)
        mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
                               or {})
        image_sizes = []
        for item_idx in range(len(images)):
            image_size = images.get_image_size(item_idx)
            image_sizes.append((image_size.height, image_size.width))

        mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
            image_sizes=image_sizes, **mm_processor_kwargs)

        mm_placeholders = {}
        split_sizes = mm_tokens_per_modality["num_image_tokens"]
        if split_sizes:
            chunked_mm_positions = torch.split(mm_positions, split_sizes)
            mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()]
            chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
            ranges = [
                PlaceholderRange(
                    offset=positions[0].item(),
                    length=positions.shape[0],
                    is_embed=(mm_tokens == hf_processor.image_token_id).bool())
                for positions, mm_tokens in zip(chunked_mm_positions,
                                                chunked_mm_tokens)
            ]
            mm_placeholders = {"image": ranges}

        num_image_patches = torch.tensor(
            mm_tokens_per_modality["num_image_patches"]
        ) if "num_image_patches" in mm_tokens_per_modality else None
        processed_data['num_image_patches'] = num_image_patches
        mm_kwargs = MultiModalKwargs.from_hf_inputs(
            processed_data,
            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
                                       num_image_patches),
        )

        mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
                                        tokenization_kwargs)
        return MultiModalInputs(
            type="multimodal",
            prompt=prompt,
            prompt_token_ids=prompt_ids,
            mm_kwargs=mm_kwargs,
            mm_hashes=mm_hashes,
            mm_placeholders=mm_placeholders,
        )

_apply_hf_processor_text_mm

_apply_hf_processor_text_mm(
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
)

Apply the HF processor on the prompt text and multi-modal data together.

In addition, return whether prompt replacements have been applied.

Source code in vllm/model_executor/models/transformers.py
def _apply_hf_processor_text_mm(
    self,
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
):
    """
    Apply the HF processor on the prompt text and multi-modal data
    together.

    In addition, return whether prompt replacements have been applied.
    """
    processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
    processor_data["return_mm_token_type_ids"] = True

    processed_data = self._call_hf_processor(
        prompt=prompt_text,
        mm_data=processor_data,
        mm_kwargs=hf_processor_mm_kwargs,
        tok_kwargs=tokenization_kwargs,
    )
    processed_data.update(passthrough_data)

    prompt_ids, = processed_data.pop("input_ids").tolist()
    mm_token_type_ids = processed_data.pop(
        "mm_token_type_ids"
    ) if "mm_token_type_ids" in processed_data else processed_data.pop(
        "token_type_ids")  # for gemma3 only

    return prompt_ids, processed_data, mm_token_type_ids

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs,
    hf_processor_mm_kwargs,
    num_image_patches: Tensor = None,
)
Source code in vllm/model_executor/models/transformers.py
def _get_mm_fields_config(
    self,
    hf_inputs,
    hf_processor_mm_kwargs,
    num_image_patches: torch.Tensor = None,
):
    # HF Processors always return a mask but vLLM doesn't need it
    hf_inputs.pop("attention_mask", None)
    mm_fields = {
        key: MultiModalFieldConfig.flat_from_sizes("image",
                                                   num_image_patches)
        for key in hf_inputs
    }
    mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
        "image", num_image_patches)
    mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")
    return mm_fields

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
)

Given the original multi-modal items for this modality and HF-processed data, output the updates to perform.

The information returned by this method is used to update token inputs which bypass the HF processor. It is also used to update the output of HF processor if the HF process does not apply prompt updates to text inputs.

Moreover, this information is critical to determine the token positions in order to construct :class:~vllm-multimodal.input.PlaceholderRange for each multi-modal item.

Source code in vllm/model_executor/models/transformers.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
):
    """
    Given the original multi-modal items for this modality
    and HF-processed data, output the updates to perform.

    The information returned by this method is used to update token inputs
    which bypass the HF processor. It is also used to update the output of
    HF processor if the HF process does not apply prompt updates to text
    inputs.

    Moreover, this information is critical to determine the token positions
    in order to construct  :class:`~vllm-multimodal.input.PlaceholderRange`
    for each multi-modal item.
    """
    return None

apply

apply(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Optional[
        Mapping[str, object]
    ] = None,
    return_mm_hashes: bool = False,
) -> MultiModalInputs

Process multi-modal inputs to be used in vLLM.

Apply HF Processor on prompt text and multi-modal data together, outputting token IDs and processed tensors.

Source code in vllm/model_executor/models/transformers.py
def apply(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Optional[Mapping[str, object]] = None,
    return_mm_hashes: bool = False,
) -> MultiModalInputs:
    """
    Process multi-modal inputs to be used in vLLM.

    Apply HF Processor on prompt text and multi-modal data together,
    outputting token IDs and processed tensors.
    """
    if tokenization_kwargs is None:
        tokenization_kwargs = {}

    mm_items = self._to_mm_items(mm_data)
    hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
    if not isinstance(prompt, str):
        # the prompt is the tokenized ids which is not supported
        # by the hf_processor, which is why we would need to decode the ids
        # into string
        prompt = hf_processor.decode(prompt)

    (prompt_ids, processed_data,
     mm_token_type_ids) = self._apply_hf_processor_text_mm(
         prompt_text=prompt,
         mm_items=mm_items,
         hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         tokenization_kwargs=tokenization_kwargs,
     )

    # HF processor will return `mm_token_type_ids` from which
    # we can infer mm_placeholders. Until then hardcode to make code run
    # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
    mm_positions = torch.where(mm_token_type_ids == 1)[1]
    images = mm_items.get_items("image", ImageProcessorItems)
    mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
                           or {})
    image_sizes = []
    for item_idx in range(len(images)):
        image_size = images.get_image_size(item_idx)
        image_sizes.append((image_size.height, image_size.width))

    mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
        image_sizes=image_sizes, **mm_processor_kwargs)

    mm_placeholders = {}
    split_sizes = mm_tokens_per_modality["num_image_tokens"]
    if split_sizes:
        chunked_mm_positions = torch.split(mm_positions, split_sizes)
        mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()]
        chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
        ranges = [
            PlaceholderRange(
                offset=positions[0].item(),
                length=positions.shape[0],
                is_embed=(mm_tokens == hf_processor.image_token_id).bool())
            for positions, mm_tokens in zip(chunked_mm_positions,
                                            chunked_mm_tokens)
        ]
        mm_placeholders = {"image": ranges}

    num_image_patches = torch.tensor(
        mm_tokens_per_modality["num_image_patches"]
    ) if "num_image_patches" in mm_tokens_per_modality else None
    processed_data['num_image_patches'] = num_image_patches
    mm_kwargs = MultiModalKwargs.from_hf_inputs(
        processed_data,
        self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs,
                                   num_image_patches),
    )

    mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
                                    tokenization_kwargs)
    return MultiModalInputs(
        type="multimodal",
        prompt=prompt,
        prompt_token_ids=prompt_ids,
        mm_kwargs=mm_kwargs,
        mm_hashes=mm_hashes,
        mm_placeholders=mm_placeholders,
    )

TransformersForCausalLM

Bases: Module, SupportsQuant, SupportsLoRA, SupportsPP

Source code in vllm/model_executor/models/transformers.py
@support_torch_compile
class TransformersForCausalLM(nn.Module, SupportsQuant, SupportsLoRA,
                              SupportsPP):
    embedding_padding_modules = ["lm_head"]
    embedding_modules = ["embed_tokens"
                         ]  # TODO transformers will have a util to get it

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config: PretrainedConfig = vllm_config.model_config.hf_config
        quant_config: QuantizationConfig = vllm_config.quant_config

        self.config = config

        self.transformers_model = TransformersModel(vllm_config=vllm_config,
                                                    prefix=prefix)
        self.model = self.transformers_model.model

        if get_pp_group().is_last_rank:
            self.unpadded_vocab_size = config.vocab_size
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
            if config.tie_word_embeddings:
                self.lm_head = self.lm_head.tie_weights(
                    self.model.get_input_embeddings())

            logit_scale = getattr(config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                    config.vocab_size,
                                                    logit_scale)
        else:
            self.lm_head = PPMissingLayer()

        self.make_empty_intermediate_tensors = (
            self.transformers_model.make_empty_intermediate_tensors)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        model_output = self.transformers_model.forward(input_ids, positions,
                                                       intermediate_tensors,
                                                       inputs_embeds)
        return model_output

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        skip_prefixes = ["lm_head."
                         ] if self.config.tie_word_embeddings else None
        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
        return loader.load_weights(weights)

config instance-attribute

config = config

embedding_modules class-attribute instance-attribute

embedding_modules = ['embed_tokens']

embedding_padding_modules class-attribute instance-attribute

embedding_padding_modules = ['lm_head']

lm_head instance-attribute

lm_head = ParallelLMHead(
    vocab_size,
    hidden_size,
    quant_config=quant_config,
    prefix=maybe_prefix(prefix, "lm_head"),
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size, logit_scale
)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

model instance-attribute

model = model

transformers_model instance-attribute

transformers_model = TransformersModel(
    vllm_config=vllm_config, prefix=prefix
)

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config: PretrainedConfig = vllm_config.model_config.hf_config
    quant_config: QuantizationConfig = vllm_config.quant_config

    self.config = config

    self.transformers_model = TransformersModel(vllm_config=vllm_config,
                                                prefix=prefix)
    self.model = self.transformers_model.model

    if get_pp_group().is_last_rank:
        self.unpadded_vocab_size = config.vocab_size
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        if config.tie_word_embeddings:
            self.lm_head = self.lm_head.tie_weights(
                self.model.get_input_embeddings())

        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size,
                                                logit_scale)
    else:
        self.lm_head = PPMissingLayer()

    self.make_empty_intermediate_tensors = (
        self.transformers_model.make_empty_intermediate_tensors)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/transformers.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Optional[Tensor],
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/transformers.py
def forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    model_output = self.transformers_model.forward(input_ids, positions,
                                                   intermediate_tensors,
                                                   inputs_embeds)
    return model_output

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/transformers.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    skip_prefixes = ["lm_head."
                     ] if self.config.tie_word_embeddings else None
    loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
    return loader.load_weights(weights)

TransformersForMultimodalLM

Bases: Module, SupportsQuant, SupportsLoRA, SupportsPP, SupportsMultiModal

Source code in vllm/model_executor/models/transformers.py
@MULTIMODAL_REGISTRY.register_processor(
    MultiModalProcessor,
    info=MultiModalProcessingInfo,
    dummy_inputs=MultiModalDummyInputsBuilder)
class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA,
                                  SupportsPP, SupportsMultiModal):
    embedding_padding_modules = ["lm_head"]
    embedding_modules = ["embed_tokens"]

    # Backwards compatibility for prev released models. State dicts back then
    # had different formats and cannot be loaded with `AutoModel` mapping as is
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "language_model.model": "model.language_model",
            "text_model.model": "model.text_model",
            "vision_tower": "model.vision_tower",
            "vqmodel": "model.vqmodel",
            "visual": "model.visual",
            "vision_model": "model.vision_model",
            "vision_embed_tokens": "model.vision_embed_tokens",
            "image_newline": "model.image_newline",
            "multi_modal_projector": "model.multi_modal_projector",
            "text_model.lm_head": "lm_head",
            "language_model.lm_head": "lm_head",
            # Qwen models used "model" as the name for the language model.
            # Therefore, we must map each of submodule explicitly to avoid
            # conflicts with newer models that use "model.language_model".
            "model.embed_tokens": "model.language_model.embed_tokens",
            "model.layers": "model.language_model.layers",
            "model.norm": "model.language_model.norm",
        })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config: PretrainedConfig = vllm_config.model_config.hf_config
        quant_config: QuantizationConfig = vllm_config.quant_config

        self.config = config
        self.dtype = vllm_config.model_config.dtype

        self.transformers_model = TransformersModel(vllm_config=vllm_config,
                                                    prefix=prefix)
        self.model = self.transformers_model.model
        text_config = config.get_text_config()

        if get_pp_group().is_last_rank:
            self.unpadded_vocab_size = text_config.vocab_size
            self.lm_head = ParallelLMHead(
                text_config.vocab_size,
                text_config.hidden_size,
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
            if text_config.tie_word_embeddings:
                self.lm_head = self.lm_head.tie_weights(
                    self.model.get_input_embeddings())

            logit_scale = getattr(config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                    text_config.vocab_size,
                                                    logit_scale)
        else:
            self.lm_head = PPMissingLayer()

        self.make_empty_intermediate_tensors = (
            self.transformers_model.make_empty_intermediate_tensors)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        # NOTE: In v1, inputs_embeds is always generated at model runner from
        # `get_multimodal_embeddings` and `get_input_embeddings`, this
        # condition is only for v0 compatibility.
        if inputs_embeds is None:
            multimodal_embeds = self.get_multimodal_embeddings(**kwargs)
            if multimodal_embeds is not None:
                inputs_embeds = self.get_input_embeddings(
                    input_ids, multimodal_embeds)
                input_ids = None

        model_output = self.transformers_model.forward(input_ids, positions,
                                                       intermediate_tensors,
                                                       inputs_embeds)
        return model_output

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=([
                "lm_head."
            ] if self.config.get_text_config().tie_word_embeddings else None),
        )
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

    def get_multimodal_embeddings(self, **kwargs):
        pixel_values = kwargs.pop("pixel_values", None)
        pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
            "image_patches", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if image_embeds is not None:
            return image_embeds

        if pixel_values is None and image_embeds is None:
            return None

        num_image_patches = kwargs.pop("num_image_patches")
        if pixel_values is not None:
            if isinstance(pixel_values, torch.Tensor):
                pixel_values = flatten_bn(pixel_values).to(self.dtype)
            elif is_list_of(pixel_values, torch.Tensor):
                pixel_values = flatten_bn(flatten_bn(pixel_values),
                                          concat=True).to(self.dtype)
            else:
                raise ValueError(
                    f"Unsupported pixel_values type {type(pixel_values)}. "
                    "Expected `torch.Tensor` or list of `torch.Tensor`.")

            if isinstance(num_image_patches, list):
                num_image_patches = torch.cat(num_image_patches)

            vision_embeddings = self.model.get_image_features(
                pixel_values,
                **{
                    k: v.flatten(0, 1)
                    for k, v in kwargs.items()
                },
            )

            if isinstance(vision_embeddings, torch.Tensor):
                if vision_embeddings.ndim == 2:
                    vision_embeddings = vision_embeddings.unsqueeze(0)

                # Embeddings have to be 2D tensors of length `num_images`
                # but transformers returns concat tensors if each patch
                # is of different size. We split it back to make vLLM happy
                vision_embeddings = torch.split(
                    vision_embeddings,
                    num_image_patches.flatten().tolist())
                vision_embeddings = [
                    embed.flatten(start_dim=0, end_dim=-2)
                    for embed in vision_embeddings
                ]

            return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings=None,
    ) -> torch.Tensor:
        inputs_embeds = self.model.get_input_embeddings()(input_ids)
        if (multimodal_embeddings is not None
                and len(multimodal_embeddings) != 0):
            mask = (input_ids == self.config.image_token_id)
            mask = mask.unsqueeze(-1).expand_as(inputs_embeds)
            multimodal_embeddings = torch.cat(multimodal_embeddings)

            inputs_embeds = inputs_embeds.masked_scatter(
                mask, multimodal_embeddings)
        return inputs_embeds

config instance-attribute

config = config

dtype instance-attribute

dtype = dtype

embedding_modules class-attribute instance-attribute

embedding_modules = ['embed_tokens']

embedding_padding_modules class-attribute instance-attribute

embedding_padding_modules = ['lm_head']

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "language_model.model": "model.language_model",
        "text_model.model": "model.text_model",
        "vision_tower": "model.vision_tower",
        "vqmodel": "model.vqmodel",
        "visual": "model.visual",
        "vision_model": "model.vision_model",
        "vision_embed_tokens": "model.vision_embed_tokens",
        "image_newline": "model.image_newline",
        "multi_modal_projector": "model.multi_modal_projector",
        "text_model.lm_head": "lm_head",
        "language_model.lm_head": "lm_head",
        "model.embed_tokens": "model.language_model.embed_tokens",
        "model.layers": "model.language_model.layers",
        "model.norm": "model.language_model.norm",
    }
)

lm_head instance-attribute

lm_head = ParallelLMHead(
    vocab_size,
    hidden_size,
    quant_config=quant_config,
    prefix=maybe_prefix(prefix, "lm_head"),
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(
    unpadded_vocab_size, vocab_size, logit_scale
)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

model instance-attribute

model = model

transformers_model instance-attribute

transformers_model = TransformersModel(
    vllm_config=vllm_config, prefix=prefix
)

unpadded_vocab_size instance-attribute

unpadded_vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config: PretrainedConfig = vllm_config.model_config.hf_config
    quant_config: QuantizationConfig = vllm_config.quant_config

    self.config = config
    self.dtype = vllm_config.model_config.dtype

    self.transformers_model = TransformersModel(vllm_config=vllm_config,
                                                prefix=prefix)
    self.model = self.transformers_model.model
    text_config = config.get_text_config()

    if get_pp_group().is_last_rank:
        self.unpadded_vocab_size = text_config.vocab_size
        self.lm_head = ParallelLMHead(
            text_config.vocab_size,
            text_config.hidden_size,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        if text_config.tie_word_embeddings:
            self.lm_head = self.lm_head.tie_weights(
                self.model.get_input_embeddings())

        logit_scale = getattr(config, "logit_scale", 1.0)
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                text_config.vocab_size,
                                                logit_scale)
    else:
        self.lm_head = PPMissingLayer()

    self.make_empty_intermediate_tensors = (
        self.transformers_model.make_empty_intermediate_tensors)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/transformers.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Optional[Tensor],
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs: object,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/transformers.py
def forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
    # NOTE: In v1, inputs_embeds is always generated at model runner from
    # `get_multimodal_embeddings` and `get_input_embeddings`, this
    # condition is only for v0 compatibility.
    if inputs_embeds is None:
        multimodal_embeds = self.get_multimodal_embeddings(**kwargs)
        if multimodal_embeds is not None:
            inputs_embeds = self.get_input_embeddings(
                input_ids, multimodal_embeds)
            input_ids = None

    model_output = self.transformers_model.forward(input_ids, positions,
                                                   intermediate_tensors,
                                                   inputs_embeds)
    return model_output

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor, multimodal_embeddings=None
) -> Tensor
Source code in vllm/model_executor/models/transformers.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings=None,
) -> torch.Tensor:
    inputs_embeds = self.model.get_input_embeddings()(input_ids)
    if (multimodal_embeddings is not None
            and len(multimodal_embeddings) != 0):
        mask = (input_ids == self.config.image_token_id)
        mask = mask.unsqueeze(-1).expand_as(inputs_embeds)
        multimodal_embeddings = torch.cat(multimodal_embeddings)

        inputs_embeds = inputs_embeds.masked_scatter(
            mask, multimodal_embeddings)
    return inputs_embeds

get_multimodal_embeddings

get_multimodal_embeddings(**kwargs)
Source code in vllm/model_executor/models/transformers.py
def get_multimodal_embeddings(self, **kwargs):
    pixel_values = kwargs.pop("pixel_values", None)
    pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
        "image_patches", None)
    image_embeds = kwargs.pop("image_embeds", None)

    if image_embeds is not None:
        return image_embeds

    if pixel_values is None and image_embeds is None:
        return None

    num_image_patches = kwargs.pop("num_image_patches")
    if pixel_values is not None:
        if isinstance(pixel_values, torch.Tensor):
            pixel_values = flatten_bn(pixel_values).to(self.dtype)
        elif is_list_of(pixel_values, torch.Tensor):
            pixel_values = flatten_bn(flatten_bn(pixel_values),
                                      concat=True).to(self.dtype)
        else:
            raise ValueError(
                f"Unsupported pixel_values type {type(pixel_values)}. "
                "Expected `torch.Tensor` or list of `torch.Tensor`.")

        if isinstance(num_image_patches, list):
            num_image_patches = torch.cat(num_image_patches)

        vision_embeddings = self.model.get_image_features(
            pixel_values,
            **{
                k: v.flatten(0, 1)
                for k, v in kwargs.items()
            },
        )

        if isinstance(vision_embeddings, torch.Tensor):
            if vision_embeddings.ndim == 2:
                vision_embeddings = vision_embeddings.unsqueeze(0)

            # Embeddings have to be 2D tensors of length `num_images`
            # but transformers returns concat tensors if each patch
            # is of different size. We split it back to make vLLM happy
            vision_embeddings = torch.split(
                vision_embeddings,
                num_image_patches.flatten().tolist())
            vision_embeddings = [
                embed.flatten(start_dim=0, end_dim=-2)
                for embed in vision_embeddings
            ]

        return vision_embeddings

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/transformers.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    loader = AutoWeightsLoader(
        self,
        skip_prefixes=([
            "lm_head."
        ] if self.config.get_text_config().tie_word_embeddings else None),
    )
    return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

TransformersModel

Source code in vllm/model_executor/models/transformers.py
class TransformersModel:

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        logger.info("Using Transformers backend.")

        config: PretrainedConfig = vllm_config.model_config.hf_config
        cache_config: CacheConfig = vllm_config.cache_config
        device_config: DeviceConfig = vllm_config.device_config
        model_config: ModelConfig = vllm_config.model_config
        parallel_config: ParallelConfig = vllm_config.parallel_config
        quant_config: QuantizationConfig = vllm_config.quant_config

        self.config = config
        self.text_config = config.get_text_config()
        self.cache_config = cache_config
        self.device_config = device_config
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.quant_config = quant_config

        self.pp_group = get_pp_group()
        self.pp_size = self.pp_group.world_size
        self.pp_rank = self.pp_group.rank_in_group
        self.tp_size = get_tensor_model_parallel_world_size()

        # vLLM handles interleaved sliding window attention by creating a new
        # interleaved_sliding_window attribute and deleting the sliding_window
        # attribute. This breaks the constructors in Transformers so we
        # temporarily add the attribute back to construct the model.
        config_override = nullcontext()
        if hasattr(config, "interleaved_sliding_window"):
            config_override = ConfigOverride(
                config, sliding_window=config.interleaved_sliding_window)

        # Set correct attn and init on "meta" to delay allocating GPU tensors
        # TODO: @raushan, use the public `model.set_attn_implementation()`
        # method after v4.54.0 is released
        self.text_config._attn_implementation = "vllm"
        with init_on_device_without_buffers("meta"), config_override:
            self.model: PreTrainedModel = AutoModel.from_config(
                config,
                torch_dtype=model_config.dtype,
                trust_remote_code=model_config.trust_remote_code,
            )

        self.pipeline_parallel()
        self.tensor_parallel()

        # Input embeddings
        text_config = config.get_text_config()
        if not isinstance(self.model.get_input_embeddings(), PPMissingLayer):
            self.model.set_input_embeddings(
                VocabParallelEmbedding(
                    text_config.vocab_size,
                    text_config.hidden_size,
                    org_num_embeddings=text_config.vocab_size,
                    quant_config=quant_config,
                ))

        # Attention layers
        self.attention_instances = self.create_attention_instances()

        # Initialize any parameters that have not had their modules replaced
        self.init_parameters(self.model)

        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(["hidden_states"],
                                                    text_config.hidden_size))

    def pipeline_parallel(self):
        """
        Apply the model's pipeline parallelization plan.
        """
        if self.pp_size <= 1:
            return

        if not self.model.supports_pp_plan:
            raise ValueError(
                f"{type(self.model)} does not support pipeline parallel yet!")

        module_lists = []
        module_list_idx = None
        pp_plan = list(self.model._pp_plan.keys())
        for i, name in enumerate(pp_plan):
            if isinstance(getattr(self.model, name), nn.ModuleList):
                module_lists.append(name)
                module_list_idx = i

        if len(module_lists) > 1:
            raise ValueError(
                "Pipeline parallel of models with multiple `ModuleList`s "
                "in the base model are not supported yet!")
        if module_list_idx is None:
            raise ValueError(
                f"Could not find `ModuleList` in {type(self.model)}")

        # Layers before module list
        for name in pp_plan[:module_list_idx]:
            if self.pp_group.is_first_rank or (
                    self.text_config.tie_word_embeddings
                    and self.pp_group.is_last_rank):
                continue
            setattr(self.model, name, PPMissingLayer())

        # Module list
        start_layer, end_layer = get_pp_indices(
            self.text_config.num_hidden_layers, self.pp_rank, self.pp_size)
        layers_name = pp_plan[module_list_idx]
        layers = getattr(self.model, layers_name)
        for i in range(len(layers)):
            if start_layer <= i and i < end_layer:
                continue
            layers[i] = PPMissingLayer(return_tuple=True)

        # Layers after module list
        for name in pp_plan[module_list_idx + 1:]:
            # Modules that should be on last rank
            if not self.pp_group.is_last_rank:
                setattr(self.model, name, PPMissingLayer())

    def tensor_parallel(self):
        """
        Apply the model's tensor parallelization plan.
        Currently only supports linear layers.
        """
        if not self.model.supports_tp_plan:
            if self.tp_size <= 1:
                return

            raise ValueError(
                f"{type(self.model)} does not support tensor parallel yet!")

        tp_plan = self.model._tp_plan

        def _tensor_parallel(module: nn.Module, prefix: str = ""):
            for child_name, child_module in module.named_children():
                qual_name = maybe_prefix(prefix, child_name)
                for pattern, style in tp_plan.items():
                    if re.match(pattern, qual_name) and isinstance(
                            child_module, nn.Linear):
                        new_module = replace_linear_class(
                            child_module, style, self.quant_config)
                        setattr(module, child_name, new_module)
                        log_replacement(qual_name, child_module, new_module)
                else:
                    _tensor_parallel(child_module, prefix=qual_name)

        _tensor_parallel(self.model)

    def create_attention_instances(self) -> dict[int, Attention]:
        """
        Create `Attention` instances to inform KV cache allocation.
        """
        num_heads = self.model_config.get_num_attention_heads(
            self.parallel_config)
        head_size = self.model_config.get_head_size()
        num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
        start, end = get_pp_indices(self.text_config.num_hidden_layers,
                                    self.pp_rank, self.pp_size)

        attention_instances = {}
        for i in range(start, end):
            # Handle interleaved sliding window attention
            sliding_window = None
            if (hasattr(self.config, "interleaved_sliding_window")
                    and hasattr(self.config, "sliding_window_pattern")
                    and ((i + 1) % self.config.sliding_window_pattern > 0)):
                sliding_window = self.config.interleaved_sliding_window

            attention_instances[i] = Attention(
                num_heads=num_heads,
                head_size=head_size,
                # NOTE: We use Llama scale as default, if it's set by
                # Transformers, it's updated in vllm_flash_attention_forward
                scale=head_size**-0.5,
                num_kv_heads=num_kv_heads,
                cache_config=self.cache_config,
                quant_config=self.quant_config,
                per_layer_sliding_window=sliding_window,
                prefix=f"{i}.attn")
        return attention_instances

    def init_parameters(self, module: nn.Module):
        """
        If a `parameter` is on the `meta` device, then its parent
        `module` is the original module created by:

        ```python
        with torch.device("meta"):
            self.model: PreTrainedModel = AutoModel.from_config(...)
        ```
        """
        for name, param in module.named_parameters(recurse=False):
            if param.device == torch.device("meta"):
                new_param = nn.Parameter(
                    torch.empty_like(param.data,
                                     dtype=self.model_config.dtype,
                                     device=self.device_config.device))
                setattr(module, name, new_param)
        for child in module.children():
            self.init_parameters(child)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if not get_pp_group().is_first_rank:
            assert intermediate_tensors is not None
            input_ids = None
            inputs_embeds = intermediate_tensors["hidden_states"]

        if input_ids is not None:
            input_ids = input_ids[None, ...]
        if inputs_embeds is not None:
            inputs_embeds = inputs_embeds[None, ...]

        if self.model_config.uses_mrope:
            position_ids = positions[:, None]
        else:
            position_ids = positions[None, ...]

        hidden_states = self.model(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            use_cache=False,
            position_ids=position_ids,
            attention_instances=self.attention_instances,
            return_dict=False)[0][0, ...]  # we remove batch dimension for now

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})

        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        params_dict = dict(self.named_parameters())

        loaded_params = set[str]()
        for name, loaded_weight in weights:
            # Use "model" instead of base_model_prefix because
            # the base model attribute in vLLM is always `model`
            if not name.startswith(prefix := "model."):
                name = prefix + name

            if is_pp_missing_parameter(name, self):
                continue
            if name in params_dict:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
                loaded_params.add(name)
        return loaded_params

attention_instances instance-attribute

attention_instances = create_attention_instances()

cache_config instance-attribute

cache_config = cache_config

config instance-attribute

config = config

device_config instance-attribute

device_config = device_config

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors_factory(
        ["hidden_states"], hidden_size
    )
)

model instance-attribute

model: PreTrainedModel = from_config(
    config,
    torch_dtype=dtype,
    trust_remote_code=trust_remote_code,
)

model_config instance-attribute

model_config = model_config

parallel_config instance-attribute

parallel_config = parallel_config

pp_group instance-attribute

pp_group = get_pp_group()

pp_rank instance-attribute

pp_rank = rank_in_group

pp_size instance-attribute

pp_size = world_size

quant_config instance-attribute

quant_config = quant_config

text_config instance-attribute

text_config = get_text_config()

tp_size instance-attribute

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    logger.info("Using Transformers backend.")

    config: PretrainedConfig = vllm_config.model_config.hf_config
    cache_config: CacheConfig = vllm_config.cache_config
    device_config: DeviceConfig = vllm_config.device_config
    model_config: ModelConfig = vllm_config.model_config
    parallel_config: ParallelConfig = vllm_config.parallel_config
    quant_config: QuantizationConfig = vllm_config.quant_config

    self.config = config
    self.text_config = config.get_text_config()
    self.cache_config = cache_config
    self.device_config = device_config
    self.model_config = model_config
    self.parallel_config = parallel_config
    self.quant_config = quant_config

    self.pp_group = get_pp_group()
    self.pp_size = self.pp_group.world_size
    self.pp_rank = self.pp_group.rank_in_group
    self.tp_size = get_tensor_model_parallel_world_size()

    # vLLM handles interleaved sliding window attention by creating a new
    # interleaved_sliding_window attribute and deleting the sliding_window
    # attribute. This breaks the constructors in Transformers so we
    # temporarily add the attribute back to construct the model.
    config_override = nullcontext()
    if hasattr(config, "interleaved_sliding_window"):
        config_override = ConfigOverride(
            config, sliding_window=config.interleaved_sliding_window)

    # Set correct attn and init on "meta" to delay allocating GPU tensors
    # TODO: @raushan, use the public `model.set_attn_implementation()`
    # method after v4.54.0 is released
    self.text_config._attn_implementation = "vllm"
    with init_on_device_without_buffers("meta"), config_override:
        self.model: PreTrainedModel = AutoModel.from_config(
            config,
            torch_dtype=model_config.dtype,
            trust_remote_code=model_config.trust_remote_code,
        )

    self.pipeline_parallel()
    self.tensor_parallel()

    # Input embeddings
    text_config = config.get_text_config()
    if not isinstance(self.model.get_input_embeddings(), PPMissingLayer):
        self.model.set_input_embeddings(
            VocabParallelEmbedding(
                text_config.vocab_size,
                text_config.hidden_size,
                org_num_embeddings=text_config.vocab_size,
                quant_config=quant_config,
            ))

    # Attention layers
    self.attention_instances = self.create_attention_instances()

    # Initialize any parameters that have not had their modules replaced
    self.init_parameters(self.model)

    self.make_empty_intermediate_tensors = (
        make_empty_intermediate_tensors_factory(["hidden_states"],
                                                text_config.hidden_size))

create_attention_instances

create_attention_instances() -> dict[int, Attention]

Create Attention instances to inform KV cache allocation.

Source code in vllm/model_executor/models/transformers.py
def create_attention_instances(self) -> dict[int, Attention]:
    """
    Create `Attention` instances to inform KV cache allocation.
    """
    num_heads = self.model_config.get_num_attention_heads(
        self.parallel_config)
    head_size = self.model_config.get_head_size()
    num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
    start, end = get_pp_indices(self.text_config.num_hidden_layers,
                                self.pp_rank, self.pp_size)

    attention_instances = {}
    for i in range(start, end):
        # Handle interleaved sliding window attention
        sliding_window = None
        if (hasattr(self.config, "interleaved_sliding_window")
                and hasattr(self.config, "sliding_window_pattern")
                and ((i + 1) % self.config.sliding_window_pattern > 0)):
            sliding_window = self.config.interleaved_sliding_window

        attention_instances[i] = Attention(
            num_heads=num_heads,
            head_size=head_size,
            # NOTE: We use Llama scale as default, if it's set by
            # Transformers, it's updated in vllm_flash_attention_forward
            scale=head_size**-0.5,
            num_kv_heads=num_kv_heads,
            cache_config=self.cache_config,
            quant_config=self.quant_config,
            per_layer_sliding_window=sliding_window,
            prefix=f"{i}.attn")
    return attention_instances

forward

forward(
    input_ids: Optional[Tensor],
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    inputs_embeds: Optional[Tensor] = None,
) -> Union[Tensor, IntermediateTensors]
Source code in vllm/model_executor/models/transformers.py
def forward(
    self,
    input_ids: Optional[torch.Tensor],
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    if not get_pp_group().is_first_rank:
        assert intermediate_tensors is not None
        input_ids = None
        inputs_embeds = intermediate_tensors["hidden_states"]

    if input_ids is not None:
        input_ids = input_ids[None, ...]
    if inputs_embeds is not None:
        inputs_embeds = inputs_embeds[None, ...]

    if self.model_config.uses_mrope:
        position_ids = positions[:, None]
    else:
        position_ids = positions[None, ...]

    hidden_states = self.model(
        input_ids=input_ids,
        inputs_embeds=inputs_embeds,
        use_cache=False,
        position_ids=position_ids,
        attention_instances=self.attention_instances,
        return_dict=False)[0][0, ...]  # we remove batch dimension for now

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({"hidden_states": hidden_states})

    return hidden_states

init_parameters

init_parameters(module: Module)

If a parameter is on the meta device, then its parent module is the original module created by:

with torch.device("meta"):
    self.model: PreTrainedModel = AutoModel.from_config(...)
Source code in vllm/model_executor/models/transformers.py
def init_parameters(self, module: nn.Module):
    """
    If a `parameter` is on the `meta` device, then its parent
    `module` is the original module created by:

    ```python
    with torch.device("meta"):
        self.model: PreTrainedModel = AutoModel.from_config(...)
    ```
    """
    for name, param in module.named_parameters(recurse=False):
        if param.device == torch.device("meta"):
            new_param = nn.Parameter(
                torch.empty_like(param.data,
                                 dtype=self.model_config.dtype,
                                 device=self.device_config.device))
            setattr(module, name, new_param)
    for child in module.children():
        self.init_parameters(child)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/transformers.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    params_dict = dict(self.named_parameters())

    loaded_params = set[str]()
    for name, loaded_weight in weights:
        # Use "model" instead of base_model_prefix because
        # the base model attribute in vLLM is always `model`
        if not name.startswith(prefix := "model."):
            name = prefix + name

        if is_pp_missing_parameter(name, self):
            continue
        if name in params_dict:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
            loaded_params.add(name)
    return loaded_params

pipeline_parallel

pipeline_parallel()

Apply the model's pipeline parallelization plan.

Source code in vllm/model_executor/models/transformers.py
def pipeline_parallel(self):
    """
    Apply the model's pipeline parallelization plan.
    """
    if self.pp_size <= 1:
        return

    if not self.model.supports_pp_plan:
        raise ValueError(
            f"{type(self.model)} does not support pipeline parallel yet!")

    module_lists = []
    module_list_idx = None
    pp_plan = list(self.model._pp_plan.keys())
    for i, name in enumerate(pp_plan):
        if isinstance(getattr(self.model, name), nn.ModuleList):
            module_lists.append(name)
            module_list_idx = i

    if len(module_lists) > 1:
        raise ValueError(
            "Pipeline parallel of models with multiple `ModuleList`s "
            "in the base model are not supported yet!")
    if module_list_idx is None:
        raise ValueError(
            f"Could not find `ModuleList` in {type(self.model)}")

    # Layers before module list
    for name in pp_plan[:module_list_idx]:
        if self.pp_group.is_first_rank or (
                self.text_config.tie_word_embeddings
                and self.pp_group.is_last_rank):
            continue
        setattr(self.model, name, PPMissingLayer())

    # Module list
    start_layer, end_layer = get_pp_indices(
        self.text_config.num_hidden_layers, self.pp_rank, self.pp_size)
    layers_name = pp_plan[module_list_idx]
    layers = getattr(self.model, layers_name)
    for i in range(len(layers)):
        if start_layer <= i and i < end_layer:
            continue
        layers[i] = PPMissingLayer(return_tuple=True)

    # Layers after module list
    for name in pp_plan[module_list_idx + 1:]:
        # Modules that should be on last rank
        if not self.pp_group.is_last_rank:
            setattr(self.model, name, PPMissingLayer())

tensor_parallel

tensor_parallel()

Apply the model's tensor parallelization plan. Currently only supports linear layers.

Source code in vllm/model_executor/models/transformers.py
def tensor_parallel(self):
    """
    Apply the model's tensor parallelization plan.
    Currently only supports linear layers.
    """
    if not self.model.supports_tp_plan:
        if self.tp_size <= 1:
            return

        raise ValueError(
            f"{type(self.model)} does not support tensor parallel yet!")

    tp_plan = self.model._tp_plan

    def _tensor_parallel(module: nn.Module, prefix: str = ""):
        for child_name, child_module in module.named_children():
            qual_name = maybe_prefix(prefix, child_name)
            for pattern, style in tp_plan.items():
                if re.match(pattern, qual_name) and isinstance(
                        child_module, nn.Linear):
                    new_module = replace_linear_class(
                        child_module, style, self.quant_config)
                    setattr(module, child_name, new_module)
                    log_replacement(qual_name, child_module, new_module)
            else:
                _tensor_parallel(child_module, prefix=qual_name)

    _tensor_parallel(self.model)

init_on_device_without_buffers

init_on_device_without_buffers(device: device)

A context manager under which models are initialized with all parameters on the specified device. However buffers are not initialized on specified device.

Parameters:

Name Type Description Default
device `torch.device`

Device to initialize all parameters on.

required
Source code in vllm/model_executor/models/transformers.py
@contextmanager
def init_on_device_without_buffers(device: torch.device):
    """
    A context manager under which models are initialized with all
    parameters on the specified device. However buffers are not
    initialized on specified device.

    Args:
        device (`torch.device`):
            Device to initialize all parameters on.
    """

    old_register_parameter = nn.Module.register_parameter

    def register_empty_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            param_cls = type(module._parameters[name])
            kwargs = module._parameters[name].__dict__
            kwargs["requires_grad"] = param.requires_grad
            module._parameters[name] = param_cls(
                module._parameters[name].to(device), **kwargs)

    tensor_constructors_to_patch = {}

    def patch_tensor_constructor(fn):

        def wrapper(*args, **kwargs):
            kwargs["device"] = device
            return fn(*args, **kwargs)

        return wrapper

    try:
        nn.Module.register_parameter = register_empty_parameter
        for torch_function_name in tensor_constructors_to_patch:
            setattr(
                torch, torch_function_name,
                patch_tensor_constructor(getattr(torch, torch_function_name)))
        yield
    finally:
        nn.Module.register_parameter = old_register_parameter
        for torch_function_name, old_torch_function in (
                tensor_constructors_to_patch.items()):
            setattr(torch, torch_function_name, old_torch_function)

log_replacement

log_replacement(
    name: str, old_module: Module, new_module: Module
)
Source code in vllm/model_executor/models/transformers.py
def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
    logger.debug("%s: %s -> %s", name, old_module, new_module)

replace_linear_class

replace_linear_class(
    linear: Linear,
    style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig,
) -> Union[ColumnParallelLinear, RowParallelLinear]

Replace nn.Linear with one of vLLM's tensor parallel linear classes.

Parameters:

Name Type Description Default
linear Linear

nn.Linear to be replaced.

required
style str

Tensor parallel style of the new linear, e.g. "colwise".

required
quant_config QuantConfig

Quantization config for the new linear.

required

Returns: Union[ColumnParallelLinear, RowParallelLinear]: The new linear.

Source code in vllm/model_executor/models/transformers.py
def replace_linear_class(
    linear: nn.Linear, style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig
) -> Union[ColumnParallelLinear, RowParallelLinear]:
    """
    Replace nn.Linear with one of vLLM's tensor parallel linear classes.

    Args:
        linear (nn.Linear): `nn.Linear` to be replaced.
        style (str): Tensor parallel style of the new linear, e.g. "colwise".
        quant_config (QuantConfig): Quantization config for the new linear.
    Returns:
        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
    """

    if not isinstance(style, str):
        raise ValueError(
            f"Unsupported parallel style type {type(style)}, expected str")

    vllm_linear_cls = {
        "colwise": ColumnParallelLinear,
        "rowwise": RowParallelLinear,
    }.get(style, ReplicatedLinear)

    return vllm_linear_cls(
        input_size=linear.in_features,
        output_size=linear.out_features,
        bias=linear.bias is not None,
        quant_config=quant_config,
        return_bias=False,
    )

vllm_flash_attention_forward

vllm_flash_attention_forward(
    module: Module,
    query: Tensor,
    key: Tensor,
    value: Tensor,
    attention_mask: Tensor,
    scaling: Optional[float] = None,
    attention_instances: Optional[dict[Attention]] = None,
    **kwargs,
)
Source code in vllm/model_executor/models/transformers.py
def vllm_flash_attention_forward(
        # Transformers args
        module: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: torch.Tensor,
        # Transformers kwargs
        scaling: Optional[float] = None,
        # vLLM kwargs
        attention_instances: Optional[dict[Attention]] = None,
        **kwargs):
    self_attn = attention_instances[module.layer_idx]
    if scaling is not None:
        self_attn.impl.scale = float(scaling)
    hidden = query.shape[-2]
    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
    return self_attn.forward(query, key, value), None