vllm.multimodal.profiling

_I module-attribute

_I = TypeVar('_I', bound=BaseProcessingInfo)

logger module-attribute

logger = init_logger(__name__)

BaseDummyInputsBuilder

Bases: ABC, Generic[_I]

Abstract base class that constructs the dummy data to profile multi-modal models.

Source code in vllm/multimodal/profiling.py
class BaseDummyInputsBuilder(ABC, Generic[_I]):
    """
    Abstract base class that constructs the dummy data to profile
    multi-modal models.
    """

    def __init__(self, info: _I) -> None:
        super().__init__()

        self.info = info

    @abstractmethod
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """
        Build the text input corresponding to `mm_counts`.
        """
        raise NotImplementedError

    @abstractmethod
    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        """
        Build the multimodal input which, after processing, results in
        the maximum possible number of placeholder tokens.
        """
        raise NotImplementedError

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """
        Build the input which, after processing, results in
        the maximum possible number of placeholder tokens.
        """
        dummy_text = self.get_dummy_text(mm_counts)
        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)

        return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data)

    def _get_dummy_audios(
        self,
        *,
        length: int,
        num_audios: int,
    ) -> list[npt.NDArray]:
        if num_audios == 0:
            return []
        audio = np.zeros((length, ))
        return [audio] * num_audios

    def _get_dummy_images(
        self,
        *,
        width: int,
        height: int,
        num_images: int,
    ) -> list[Image.Image]:
        if num_images == 0:
            return []
        image = Image.new("RGB", (width, height), color=255)
        return [image] * num_images

    def _get_dummy_videos(
        self,
        *,
        width: int,
        height: int,
        num_frames: int,
        num_videos: int,
    ) -> list[npt.NDArray]:
        if num_videos == 0:
            return []
        video = np.full((num_frames, width, height, 3), 255)
        return [video] * num_videos

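The following is an illustrative sketch, not part of the vLLM source: a minimal concrete builder for a hypothetical single-image model. The `<image>` placeholder string and the `MyProcessingInfo` info class are assumptions made for illustration, and the import paths may differ between vLLM versions.

from collections.abc import Mapping

from vllm.multimodal.inputs import MultiModalDataDict  # import path assumed
from vllm.multimodal.profiling import BaseDummyInputsBuilder


class MyDummyInputsBuilder(BaseDummyInputsBuilder["MyProcessingInfo"]):
    """Dummy-input builder for a hypothetical single-image model."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One (model-specific, assumed) placeholder token per image.
        num_images = mm_counts.get("image", 0)
        return "<image>" * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        # Use the largest resolution the model accepts so that processing
        # produces the maximum possible number of placeholder tokens.
        num_images = mm_counts.get("image", 0)
        return {
            "image": self._get_dummy_images(width=1024,
                                            height=1024,
                                            num_images=num_images),
        }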
info instance-attribute

info = info

__init__

__init__(info: _I) -> None
Source code in vllm/multimodal/profiling.py
def __init__(self, info: _I) -> None:
    super().__init__()

    self.info = info

_get_dummy_audios

_get_dummy_audios(
    *, length: int, num_audios: int
) -> list[NDArray]
Source code in vllm/multimodal/profiling.py
def _get_dummy_audios(
    self,
    *,
    length: int,
    num_audios: int,
) -> list[npt.NDArray]:
    if num_audios == 0:
        return []
    audio = np.zeros((length, ))
    return [audio] * num_audios

_get_dummy_images

_get_dummy_images(
    *, width: int, height: int, num_images: int
) -> list[Image]
Source code in vllm/multimodal/profiling.py
def _get_dummy_images(
    self,
    *,
    width: int,
    height: int,
    num_images: int,
) -> list[Image.Image]:
    if num_images == 0:
        return []
    image = Image.new("RGB", (width, height), color=255)
    return [image] * num_images

_get_dummy_videos

_get_dummy_videos(
    *,
    width: int,
    height: int,
    num_frames: int,
    num_videos: int,
) -> list[NDArray]
Source code in vllm/multimodal/profiling.py
def _get_dummy_videos(
    self,
    *,
    width: int,
    height: int,
    num_frames: int,
    num_videos: int,
) -> list[npt.NDArray]:
    if num_videos == 0:
        return []
    video = np.full((num_frames, width, height, 3), 255)
    return [video] * num_videos

get_dummy_mm_data abstractmethod

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict

Build the multimodal input which, after processing, results in the maximum possible number of placeholder tokens.

Source code in vllm/multimodal/profiling.py
@abstractmethod
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    """
    Build the multimodal input which, after processing, results in
    the maximum possible number of placeholder tokens.
    """
    raise NotImplementedError

get_dummy_processor_inputs

get_dummy_processor_inputs(
    seq_len: int, mm_counts: Mapping[str, int]
) -> ProcessorInputs

Build the input which, after processing, results in the maximum possible number of placeholder tokens.

Source code in vllm/multimodal/profiling.py
def get_dummy_processor_inputs(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> ProcessorInputs:
    """
    Build the input which, after processing, results in
    the maximum possible number of placeholder tokens.
    """
    dummy_text = self.get_dummy_text(mm_counts)
    dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)

    return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data)
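
A hedged usage sketch, continuing the hypothetical MyDummyInputsBuilder above; the info object is assumed to be an already-constructed processing-info instance for the model.

builder = MyDummyInputsBuilder(info)
processor_inputs = builder.get_dummy_processor_inputs(
    seq_len=4096,
    mm_counts={"image": 2},
)
# processor_inputs.prompt  -> "<image><image>"
# processor_inputs.mm_data -> {"image": [<PIL image>, <PIL image>]}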

get_dummy_text abstractmethod

get_dummy_text(mm_counts: Mapping[str, int]) -> str

Build the text input corresponding to mm_counts.

Source code in vllm/multimodal/profiling.py
@abstractmethod
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    """
    Build the text input corresponding to `mm_counts`.
    """
    raise NotImplementedError

DummyDecoderData

Bases: NamedTuple

Dummy data used for profiling.

Source code in vllm/multimodal/profiling.py
class DummyDecoderData(NamedTuple):
    """Dummy data used for profiling."""

    prompt_token_ids: list[int]
    multi_modal_data: MultiModalKwargs
    multi_modal_placeholders: MultiModalPlaceholderDict

multi_modal_data instance-attribute

multi_modal_data: MultiModalKwargs

multi_modal_placeholders instance-attribute

multi_modal_placeholders: MultiModalPlaceholderDict

prompt_token_ids instance-attribute

prompt_token_ids: list[int]

DummyEncoderData

Bases: NamedTuple

Dummy data used for profiling.

Source code in vllm/multimodal/profiling.py
class DummyEncoderData(NamedTuple):
    """Dummy data used for profiling."""

    prompt_token_ids: list[int]

prompt_token_ids instance-attribute

prompt_token_ids: list[int]

MultiModalProfiler

Bases: Generic[_I]

Contains code for running memory profiling for multi-modal models.

Source code in vllm/multimodal/profiling.py
class MultiModalProfiler(Generic[_I]):
    """
    Contains code for running memory profiling for multi-modal models.
    """

    def __init__(
        self,
        processor: BaseMultiModalProcessor[_I],
    ) -> None:
        super().__init__()

        self.processor = processor

    @property
    def processing_info(self) -> BaseProcessingInfo:
        return self.processor.info

    @property
    def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]:
        return self.processor.dummy_inputs

    def get_mm_limits(self) -> Mapping[str, int]:
        return self.processing_info.get_allowed_mm_limits()

    def _get_dummy_mm_inputs(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> MultiModalInputs:
        if mm_counts is None:
            mm_counts = self.get_mm_limits()

        factory = self.dummy_inputs
        processor_inputs = factory.get_dummy_processor_inputs(
            seq_len, mm_counts)

        return self.processor.apply(
            prompt=processor_inputs.prompt,
            mm_data=processor_inputs.mm_data,
            hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
        )

    def _get_mm_num_tokens(
        self,
        mm_inputs: MultiModalInputs,
    ) -> Mapping[str, int]:
        placeholders_by_modality = mm_inputs["mm_placeholders"]

        return {
            modality: sum(item.get_num_embeds() for item in placeholders)
            for modality, placeholders in placeholders_by_modality.items()
        }

    def get_encoder_dummy_data(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> DummyEncoderData:
        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)

        # For encoder-decoder models, use encoder prompt token ids instead of
        # decoder prompt to construct dummy seq_data for encoder profiling.
        encoder_prompt_token_ids = mm_inputs["encoder_prompt_token_ids"]

        total_len = len(encoder_prompt_token_ids)

        processor = cast(EncDecMultiModalProcessor, self.processor)
        if processor.pad_dummy_encoder_prompt:
            num_tokens_to_pad = max(total_len, seq_len) - total_len
            encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
        # NOTE: Whisper allows total_len > seq_len.
        elif total_len > seq_len and not envs.VLLM_USE_V1:
            # `max_num_batched_tokens` is defined by `SchedulerConfig`
            logger.warning_once(
                "The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
                seq_len,
                total_len,
                str(self._get_mm_num_tokens(mm_inputs)),
            )

        return DummyEncoderData(encoder_prompt_token_ids)

    def get_decoder_dummy_data(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> DummyDecoderData:
        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

        prompt_token_ids = mm_inputs["prompt_token_ids"]
        total_len = len(prompt_token_ids)

        # V0 does not support chunked prefill.
        if total_len > seq_len and not envs.VLLM_USE_V1:
            # `max_num_batched_tokens` is defined by `SchedulerConfig`
            logger.warning_once(
                "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
                seq_len,
                total_len,
                str(self._get_mm_num_tokens(mm_inputs)),
            )

        if total_len < seq_len:
            prompt_token_ids.extend([0] * (seq_len - total_len))

        return DummyDecoderData(
            prompt_token_ids=prompt_token_ids,
            multi_modal_data=mm_inputs["mm_kwargs"],
            multi_modal_placeholders=mm_inputs["mm_placeholders"],
        )

    def get_mm_max_tokens(
        self,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

        return self._get_mm_num_tokens(mm_inputs)
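
A hedged usage sketch: wiring the profiler to an existing multi-modal processor and querying dummy data for memory profiling. The processor variable is assumed to be a BaseMultiModalProcessor obtained elsewhere (for example, from the model's multi-modal registry); it is not constructed here.

from vllm.multimodal.profiling import MultiModalProfiler

profiler = MultiModalProfiler(processor)

# Per-modality limits from the processing info, e.g. {"image": 1}.
limits = profiler.get_mm_limits()

# Worst-case number of placeholder tokens per modality at this seq_len.
max_tokens = profiler.get_mm_max_tokens(seq_len=8192)

# Dummy decoder inputs for profiling; prompt_token_ids is padded with
# zeros up to seq_len when the processed prompt is shorter.
dummy = profiler.get_decoder_dummy_data(seq_len=8192, mm_counts=limits)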

dummy_inputs property

dummy_inputs: BaseDummyInputsBuilder[_I]

processing_info property

processing_info: BaseProcessingInfo

processor instance-attribute

processor = processor

__init__

__init__(processor: BaseMultiModalProcessor[_I]) -> None
Source code in vllm/multimodal/profiling.py
def __init__(
    self,
    processor: BaseMultiModalProcessor[_I],
) -> None:
    super().__init__()

    self.processor = processor

_get_dummy_mm_inputs

_get_dummy_mm_inputs(
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> MultiModalInputs
Source code in vllm/multimodal/profiling.py
def _get_dummy_mm_inputs(
    self,
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> MultiModalInputs:
    if mm_counts is None:
        mm_counts = self.get_mm_limits()

    factory = self.dummy_inputs
    processor_inputs = factory.get_dummy_processor_inputs(
        seq_len, mm_counts)

    return self.processor.apply(
        prompt=processor_inputs.prompt,
        mm_data=processor_inputs.mm_data,
        hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
    )

_get_mm_num_tokens

_get_mm_num_tokens(
    mm_inputs: MultiModalInputs,
) -> Mapping[str, int]
Source code in vllm/multimodal/profiling.py
def _get_mm_num_tokens(
    self,
    mm_inputs: MultiModalInputs,
) -> Mapping[str, int]:
    placeholders_by_modality = mm_inputs["mm_placeholders"]

    return {
        modality: sum(item.get_num_embeds() for item in placeholders)
        for modality, placeholders in placeholders_by_modality.items()
    }
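
As a worked illustration: if mm_inputs["mm_placeholders"] contains three image placeholder entries, each reporting 576 embeddings from get_num_embeds(), the method returns {"image": 3 * 576} == {"image": 1728}. The numbers are illustrative only.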

get_decoder_dummy_data

get_decoder_dummy_data(
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> DummyDecoderData
Source code in vllm/multimodal/profiling.py
def get_decoder_dummy_data(
    self,
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> DummyDecoderData:
    mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

    prompt_token_ids = mm_inputs["prompt_token_ids"]
    total_len = len(prompt_token_ids)

    # V0 does not support chunked prefill.
    if total_len > seq_len and not envs.VLLM_USE_V1:
        # `max_num_batched_tokens` is defined by `SchedulerConfig`
        logger.warning_once(
            "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
            "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
            "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
            "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
            seq_len,
            total_len,
            str(self._get_mm_num_tokens(mm_inputs)),
        )

    if total_len < seq_len:
        prompt_token_ids.extend([0] * (seq_len - total_len))

    return DummyDecoderData(
        prompt_token_ids=prompt_token_ids,
        multi_modal_data=mm_inputs["mm_kwargs"],
        multi_modal_placeholders=mm_inputs["mm_placeholders"],
    )

get_encoder_dummy_data

get_encoder_dummy_data(
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> DummyEncoderData
Source code in vllm/multimodal/profiling.py
def get_encoder_dummy_data(
    self,
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> DummyEncoderData:
    mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
    mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)

    # For encoder-decoder models, use encoder prompt token ids instead of
    # decoder prompt to construct dummy seq_data for encoder profiling.
    encoder_prompt_token_ids = mm_inputs["encoder_prompt_token_ids"]

    total_len = len(encoder_prompt_token_ids)

    processor = cast(EncDecMultiModalProcessor, self.processor)
    if processor.pad_dummy_encoder_prompt:
        num_tokens_to_pad = max(total_len, seq_len) - total_len
        encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
    # NOTE: Whisper allows total_len > seq_len.
    elif total_len > seq_len and not envs.VLLM_USE_V1:
        # `max_num_batched_tokens` is defined by `SchedulerConfig`
        logger.warning_once(
            "The encoder sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
            "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
            "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
            "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
            seq_len,
            total_len,
            str(self._get_mm_num_tokens(mm_inputs)),
        )

    return DummyEncoderData(encoder_prompt_token_ids)
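
For example, when pad_dummy_encoder_prompt is set: a processed encoder prompt of total_len = 80 with seq_len = 128 is padded with max(80, 128) - 80 = 48 zero tokens; with total_len = 150, max(150, 128) - 150 = 0, so the longer prompt is returned unpadded.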

get_mm_limits

get_mm_limits() -> Mapping[str, int]
Source code in vllm/multimodal/profiling.py
def get_mm_limits(self) -> Mapping[str, int]:
    return self.processing_info.get_allowed_mm_limits()

get_mm_max_tokens

get_mm_max_tokens(
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> Mapping[str, int]
Source code in vllm/multimodal/profiling.py
def get_mm_max_tokens(
    self,
    seq_len: int,
    mm_counts: Optional[Mapping[str, int]] = None,
) -> Mapping[str, int]:
    mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

    return self._get_mm_num_tokens(mm_inputs)

ProcessorInputs dataclass

Represents the keyword arguments to vllm.multimodal.processing.BaseMultiModalProcessor.apply.

Source code in vllm/multimodal/profiling.py
@dataclass
class ProcessorInputs:
    """
    Represents the keyword arguments to
    [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
    """
    prompt: Union[str, list[int]]
    mm_data: MultiModalDataDict
    hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
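
A minimal construction sketch; the "<image>" placeholder string and image size are illustrative, and mm_data may contain any values accepted by the model's HF processor for the given modality.

from PIL import Image

from vllm.multimodal.profiling import ProcessorInputs

inputs = ProcessorInputs(
    prompt="USER: <image>\nDescribe the image. ASSISTANT:",
    mm_data={"image": [Image.new("RGB", (336, 336))]},
    # hf_processor_mm_kwargs defaults to an empty mapping when omitted.
)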

hf_processor_mm_kwargs class-attribute instance-attribute

hf_processor_mm_kwargs: Mapping[str, object] = field(
    default_factory=dict
)

mm_data instance-attribute

mm_data: MultiModalDataDict
prompt instance-attribute

prompt: Union[str, list[int]]

__init__

__init__(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object] = dict(),
) -> None