`vllm.multimodal` ¶

Modules:

audio –
cache –
encoder_budget –
evs –
hasher –
image –
inputs –
media –
parse –
processing –
registry –
utils –
video –

Classes:

MultiModalKwargsItems –

A dictionary of processed multi-modal inputs by modality.
MultiModalRegistry –

A registry that dispatches data processing according to the model.

Attributes:

BatchedTensorInputs (TypeAlias) –

A dictionary containing nested tensors which have been batched via
MULTIMODAL_REGISTRY –

The global MultiModalRegistry
NestedTensors (TypeAlias) –

Uses a list instead of a tensor if the dimensions of each element do not match.

`BatchedTensorInputs = dict[str, NestedTensors]` `module-attribute` ¶

A dictionary containing nested tensors which have been batched via MultiModalKwargsItems.get_data.

`MULTIMODAL_REGISTRY = MultiModalRegistry()` `module-attribute` ¶

The global MultiModalRegistry is used by model runners to dispatch data processing according to the target model.

Info

mm_processing

`NestedTensors = Union[list['NestedTensors'], list['torch.Tensor'], 'torch.Tensor', tuple['torch.Tensor', ...]]` `module-attribute` ¶

Uses a list instead of a tensor if the dimensions of each element do not match.

`MultiModalKwargsItems` ¶

Bases: UserDict[str, Sequence[_I]]

A dictionary of processed multi-modal inputs by modality.

For example, given a processor that processes images into pixel_values and image_grid_thw, and audios into input_audio_features, a prompt with 2 images and 1 audio will be processed into a MultiModalKwargsItems with the following structure:

MultiModalKwargsItems(
    {
        "image": [
            # For the first image
            MultiModalKwargsItem({"pixel_values": ..., "image_grid_thw": ...}),
            # For the second imgae
            MultiModalKwargsItem({"pixel_values": ..., "image_grid_thw": ...}),
        ],
        "audio": [
            # For the first audio
            MultiModalKwargsItem({"input_audio_features": ...}),
        ],
    }
)

Unlike HF processing which returns all items in a single dictionary with batched keyword arguments, we split up the items because some of them may already be cached. Also, items from multiple requests may be batched together to improve throughput, using the logic defined by the BaseMultiModalField for each keyword argument.

Methods:

get_data –

Construct a dictionary of keyword arguments to pass to the model.

Source code in vllm/multimodal/inputs.py

class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
    """
    A dictionary of processed multi-modal inputs by modality.

    For example, given a processor that processes
    images into `pixel_values` and `image_grid_thw`,
    and audios into `input_audio_features`,
    a prompt with 2 images and 1 audio will be processed
    into a `MultiModalKwargsItems` with the following structure:

    ```python
    MultiModalKwargsItems(
        {
            "image": [
                # For the first image
                MultiModalKwargsItem({"pixel_values": ..., "image_grid_thw": ...}),
                # For the second imgae
                MultiModalKwargsItem({"pixel_values": ..., "image_grid_thw": ...}),
            ],
            "audio": [
                # For the first audio
                MultiModalKwargsItem({"input_audio_features": ...}),
            ],
        }
    )
    ```

    Unlike HF processing which returns all items
    in a single dictionary with batched keyword arguments,
    we split up the items because some of them may already be cached.
    Also, items from multiple requests may be batched together to improve throughput,
    using the logic defined by the
    [`BaseMultiModalField`][vllm.multimodal.inputs.BaseMultiModalField]
    for each keyword argument.
    """

    @staticmethod
    def from_hf_inputs(
        hf_inputs: "BatchFeature",
        config_by_key: Mapping[str, MultiModalFieldConfig],
    ):
        # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key`
        # We assume that those fields are not used in vLLM
        elems_by_key = dict[str, Sequence[MultiModalFieldElem]]()
        keys_by_modality = defaultdict[str, set[str]](set)
        for key, config in config_by_key.items():
            batch = hf_inputs.get(key)
            if batch is not None:
                elems = config.build_elems(key, batch)
                if len(elems) > 0:
                    elems_by_key[key] = elems
                    keys_by_modality[config.modality].add(key)

        items_by_modality = dict[str, list[MultiModalKwargsItem]]()
        for modality, keys in keys_by_modality.items():
            elems_in_modality = {k: elems_by_key[k] for k in keys}
            batch_sizes = {k: len(v) for k, v in elems_in_modality.items()}

            if len(set(batch_sizes.values())) > 1:
                raise ValueError(
                    f"Cannot merge different batch sizes for {modality=}! "
                    f"Found: {batch_sizes=}"
                )

            batch_size = next(iter(batch_sizes.values()))
            items_by_modality[modality] = [
                MultiModalKwargsItem({k: v[i] for k, v in elems_in_modality.items()})
                for i in range(batch_size)
            ]

        return MultiModalKwargsItems(items_by_modality)

    def __getitem__(self, modality: str) -> Sequence[_I]:
        if modality not in self:
            raise KeyError(
                f"Modality {modality!r} not found. "
                f"Available modalities: {set(self.keys())}"
            )

        return super().__getitem__(modality)  # type: ignore[return-value]

    def require_data(self) -> "MultiModalKwargsItems[MultiModalKwargsItem]":
        for modality, items in self.items():
            for i, item in enumerate(items):
                if item is None:
                    raise RuntimeError(f"Found empty mm_items[{modality}][{i}]")

        return self  # type: ignore[return-value]

    def get_data(
        self,
        *,
        device: torch.types.Device = None,
        pin_memory: bool = False,
    ) -> BatchedTensorInputs:
        """Construct a dictionary of keyword arguments to pass to the model."""
        from .utils import group_and_batch_mm_items

        items_by_modality = self.require_data()
        batches_by_modality = {
            modality: [
                data
                for _, data in group_and_batch_mm_items(
                    items,
                    device=device,
                    pin_memory=pin_memory,
                )
            ]
            for modality, items in items_by_modality.items()
            if len(items) > 0
        }

        out_data: BatchedTensorInputs = {}
        for _, batches in batches_by_modality.items():
            if len(batches) != 1:
                num_batches_by_modality = {
                    modality: len(batches)
                    for modality, batches in batches_by_modality.items()
                }

                raise RuntimeError(
                    f"Some modalities cannot be merged into a single batch "
                    f"({num_batches_by_modality=})"
                )

            out_data.update(batches[0])

        return out_data

`get_data(*, device=None, pin_memory=False)` ¶

Construct a dictionary of keyword arguments to pass to the model.

Source code in vllm/multimodal/inputs.py

def get_data(
    self,
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
) -> BatchedTensorInputs:
    """Construct a dictionary of keyword arguments to pass to the model."""
    from .utils import group_and_batch_mm_items

    items_by_modality = self.require_data()
    batches_by_modality = {
        modality: [
            data
            for _, data in group_and_batch_mm_items(
                items,
                device=device,
                pin_memory=pin_memory,
            )
        ]
        for modality, items in items_by_modality.items()
        if len(items) > 0
    }

    out_data: BatchedTensorInputs = {}
    for _, batches in batches_by_modality.items():
        if len(batches) != 1:
            num_batches_by_modality = {
                modality: len(batches)
                for modality, batches in batches_by_modality.items()
            }

            raise RuntimeError(
                f"Some modalities cannot be merged into a single batch "
                f"({num_batches_by_modality=})"
            )

        out_data.update(batches[0])

    return out_data