Skip to content

vllm.model_executor.models.terratorch

Wrapper around Terratorch models

logger module-attribute

# Module-level logger, created by init_logger with this module's name.
logger = init_logger(__name__)

Terratorch

Bases: Module, IsAttentionFree, SupportsMultiModal

Source code in vllm/model_executor/models/terratorch.py
@default_pooling_type("All")
@MULTIMODAL_REGISTRY.register_processor(
    TerratorchMultiModalProcessor,
    info=TerratorchProcessingInfo,
    dummy_inputs=TerratorchInputBuilder,
)
class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
    """Pooling model that forwards multimodal inputs to an ``InferenceRunner``.

    The runner is built from the ``pretrained_cfg`` entry of the HF config.
    Token ids and positions handed over by the engine are not used by
    ``forward``; only the keyword inputs reach the runner.
    """

    merge_by_field_config = True
    supports_multimodal_raw_input_only = True
    is_pooling_model = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        """Return the prompt placeholder for the ``i``-th item of ``modality``.

        Returns:
            Always ``None`` for modalities whose name starts with ``"image"``.

        Raises:
            ValueError: If ``modality`` does not start with ``"image"``.
        """
        if modality.startswith("image"):
            return None

        raise ValueError("Only image modality is supported")

    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
        """Create the inference runner and the pooler.

        Args:
            vllm_config: vLLM configuration. Its HF config must contain a
                ``pretrained_cfg`` entry, and a pooler config must be set.
            prefix: Not used by this constructor.
        """
        super().__init__()

        config = vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"]

        self.inference_runner = InferenceRunner(config)
        self.model = self.inference_runner.model

        # Only the presence of a pooler config is checked; its contents are
        # not read here.
        pooler_config = vllm_config.model_config.pooler_config
        assert pooler_config is not None

        self.pooler = DispatchPooler({"plugin": DummyPooler()})

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: MultiModalEmbeddings | None = None,
        *,
        is_multimodal: torch.Tensor | None = None,
        handle_oov_mm_token: bool = False,
    ) -> torch.Tensor:
        """Return an uninitialised ``(input_ids.shape[0], 0)`` tensor.

        All arguments other than ``input_ids`` are ignored.
        """
        # We do not really use any input tokens and therefore no embeddings
        # to be calculated. However, due to the mandatory token ids in
        # the input prompt we pass one token and the size of the dummy
        # embedding tensors must reflect that.
        return torch.empty((input_ids.shape[0], 0))

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ):
        """Run the inference runner on ``kwargs`` and return its ``output``.

        The positional tensors (``input_ids``, ``positions``,
        ``intermediate_tensors``, ``inputs_embeds``) are accepted but not
        forwarded to the runner.
        """
        model_output = self.inference_runner.forward(**kwargs)

        return model_output.output

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights into the wrapped runner.

        Two kinds of ``(key, value)`` entries are handled:

        * ``key == "state_dict"`` with a dict value: each entry is prefixed
          with ``"inference_runner."``, entries whose name contains
          ``"pos_embed"`` are skipped, and ``"_timm_module."`` is stripped
          from names. Names that match a registered buffer are copied
          directly; the rest are queued for ``AutoWeightsLoader``. Iteration
          over ``weights`` stops after this entry.
        * a tensor value: queued as ``"inference_runner.model.<key>"``.

        Dict values stored under any other key are ignored.

        Returns:
            Names reported by ``AutoWeightsLoader`` united with the names of
            the buffers loaded directly.
        """
        params_list = []
        model_buffers = dict(self.named_buffers())
        loaded_buffers = []
        for key, value in weights:
            if isinstance(value, (dict, OrderedDict)):
                if key == "state_dict":
                    for name, weight in value.items():
                        name = f"inference_runner.{name}"

                        if "pos_embed" in name:
                            continue

                        # Strip the prefix once, up front; the buffer lookup
                        # below therefore always sees the cleaned name.
                        name = name.replace("_timm_module.", "")

                        # this model requires a couple of buffers to be loaded
                        # that are not loadable with the AutoWeightsLoader
                        if name in model_buffers:
                            buffer = model_buffers[name]
                            weight_loader = getattr(
                                buffer, "weight_loader", default_weight_loader
                            )
                            weight_loader(buffer, weight)
                            loaded_buffers.append(name)
                        else:
                            params_list.append((name, weight))
                    break

            elif isinstance(value, torch.Tensor):
                params_list.append((f"inference_runner.model.{key}", value))

        # Load the remaining model parameters
        loader = AutoWeightsLoader(self)
        autoloaded_weights = loader.load_weights(params_list)

        return autoloaded_weights.union(set(loaded_buffers))

inference_runner instance-attribute

inference_runner = InferenceRunner(config)

is_pooling_model class-attribute instance-attribute

is_pooling_model = True

merge_by_field_config class-attribute instance-attribute

merge_by_field_config = True

model instance-attribute

model = model

pooler instance-attribute

pooler = DispatchPooler({'plugin': DummyPooler()})

supports_multimodal_raw_input_only class-attribute instance-attribute

supports_multimodal_raw_input_only = True

__init__

__init__(vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/terratorch.py
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
    """Create the inference runner and the pooler.

    Args:
        vllm_config: vLLM configuration. Its HF config must contain a
            ``pretrained_cfg`` entry, and a pooler config must be set.
        prefix: Not used by this constructor.
    """
    super().__init__()

    model_config = vllm_config.model_config
    pretrained_cfg = model_config.hf_config.to_dict()["pretrained_cfg"]

    runner = InferenceRunner(pretrained_cfg)
    self.inference_runner = runner
    self.model = runner.model

    # Only the presence of a pooler config is checked; its contents are
    # not read here.
    assert model_config.pooler_config is not None

    self.pooler = DispatchPooler({"plugin": DummyPooler()})

forward

forward(
    input_ids: Tensor | None,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    **kwargs: object,
)
Source code in vllm/model_executor/models/terratorch.py
def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
    **kwargs: object,
):
    """Run the inference runner on ``kwargs`` and return its ``output``.

    The positional tensors (``input_ids``, ``positions``,
    ``intermediate_tensors``, ``inputs_embeds``) are accepted but not
    forwarded to the runner.
    """
    return self.inference_runner.forward(**kwargs).output

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor,
    multimodal_embeddings: MultiModalEmbeddings
    | None = None,
    *,
    is_multimodal: Tensor | None = None,
    handle_oov_mm_token: bool = False,
) -> Tensor
Source code in vllm/model_executor/models/terratorch.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: MultiModalEmbeddings | None = None,
    *,
    is_multimodal: torch.Tensor | None = None,
    handle_oov_mm_token: bool = False,
) -> torch.Tensor:
    """Return an uninitialised ``(input_ids.shape[0], 0)`` tensor.

    All arguments other than ``input_ids`` are ignored.
    """
    # No token embeddings are actually computed. The prompt nevertheless
    # has to carry token ids (one token is passed), so the dummy tensor
    # keeps a matching first dimension and a zero-width second one.
    num_tokens = input_ids.shape[0]
    dummy_shape = (num_tokens, 0)
    return torch.empty(dummy_shape)

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> str | None
Source code in vllm/model_executor/models/terratorch.py
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
    if modality.startswith("image"):
        return None

    raise ValueError("Only image modality is supported")

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/terratorch.py
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """Load checkpoint weights into the wrapped runner.

    Two kinds of ``(key, value)`` entries are handled:

    * ``key == "state_dict"`` with a dict value: each entry is prefixed
      with ``"inference_runner."``, entries whose name contains
      ``"pos_embed"`` are skipped, and ``"_timm_module."`` is stripped
      from names. Names that match a registered buffer are copied
      directly; the rest are queued for ``AutoWeightsLoader``. Iteration
      over ``weights`` stops after this entry.
    * a tensor value: queued as ``"inference_runner.model.<key>"``.

    Dict values stored under any other key are ignored.

    Returns:
        Names reported by ``AutoWeightsLoader`` united with the names of
        the buffers loaded directly.
    """
    params_list = []
    model_buffers = dict(self.named_buffers())
    loaded_buffers = []
    for key, value in weights:
        if isinstance(value, (dict, OrderedDict)):
            if key == "state_dict":
                for name, weight in value.items():
                    name = f"inference_runner.{name}"

                    if "pos_embed" in name:
                        continue

                    # Strip the prefix once, up front; the buffer lookup
                    # below therefore always sees the cleaned name.
                    name = name.replace("_timm_module.", "")

                    # this model requires a couple of buffers to be loaded
                    # that are not loadable with the AutoWeightsLoader
                    if name in model_buffers:
                        buffer = model_buffers[name]
                        weight_loader = getattr(
                            buffer, "weight_loader", default_weight_loader
                        )
                        weight_loader(buffer, weight)
                        loaded_buffers.append(name)
                    else:
                        params_list.append((name, weight))
                break

        elif isinstance(value, torch.Tensor):
            params_list.append((f"inference_runner.model.{key}", value))

    # Load the remaining model parameters
    loader = AutoWeightsLoader(self)
    autoloaded_weights = loader.load_weights(params_list)

    return autoloaded_weights.union(set(loaded_buffers))

TerratorchInputBuilder

Bases: BaseDummyInputsBuilder[TerratorchProcessingInfo]

Source code in vllm/model_executor/models/terratorch.py
class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
    """Dummy-input builder backed by a ``DummyDataGenerator``."""

    def __init__(self, info: TerratorchProcessingInfo):
        """Create the generator from the HF config's ``pretrained_cfg``."""
        super().__init__(info)
        pretrained_cfg = self.info.get_hf_config().to_dict()["pretrained_cfg"]
        self.dummy_data_generator = DummyDataGenerator(pretrained_cfg)

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return an empty dummy prompt; ``mm_counts`` is ignored."""
        empty_prompt = ""
        return empty_prompt

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        """Return dummy multimodal data from the generator.

        ``seq_len`` and ``mm_counts`` are not used. A non-empty
        ``mm_options`` only triggers a warning and is otherwise ignored.
        """
        # The generator derives the dummy data from the 'input' section
        # of the HF configuration file.
        if mm_options:
            logger.warning(
                "Configurable multimodal profiling "
                "options are not supported for Terratorch. "
                "They are ignored for now."
            )

        dummy_mm_data = self.dummy_data_generator.get_dummy_mm_data()
        return dummy_mm_data

dummy_data_generator instance-attribute

dummy_data_generator = DummyDataGenerator(
    to_dict()["pretrained_cfg"]
)

__init__

__init__(info: TerratorchProcessingInfo)
Source code in vllm/model_executor/models/terratorch.py
def __init__(self, info: TerratorchProcessingInfo):
    """Create the generator from the HF config's ``pretrained_cfg``."""
    super().__init__(info)
    pretrained_cfg = self.info.get_hf_config().to_dict()["pretrained_cfg"]
    self.dummy_data_generator = DummyDataGenerator(pretrained_cfg)

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions]
    | None = None,
) -> MultiModalDataDict
Source code in vllm/model_executor/models/terratorch.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
    """Return dummy multimodal data from the generator.

    ``seq_len`` and ``mm_counts`` are not used. A non-empty
    ``mm_options`` only triggers a warning and is otherwise ignored.
    """
    # The generator derives the dummy data from the 'input' section
    # of the HF configuration file.
    if mm_options:
        logger.warning(
            "Configurable multimodal profiling "
            "options are not supported for Terratorch. "
            "They are ignored for now."
        )

    dummy_mm_data = self.dummy_data_generator.get_dummy_mm_data()
    return dummy_mm_data

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/terratorch.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    """Return an empty dummy prompt; ``mm_counts`` is ignored."""
    empty_prompt = ""
    return empty_prompt

TerratorchMultiModalDataParser

Bases: MultiModalDataParser

Source code in vllm/model_executor/models/terratorch.py
class TerratorchMultiModalDataParser(MultiModalDataParser):
    """Data parser that accepts dict-shaped image inputs for Terratorch."""

    def __init__(self, pretrained_cfg: dict, *args, **kwargs):
        """Store ``pretrained_cfg`` and forward the rest to the base class."""
        # NOTE(review): assigned before the base constructor runs,
        # presumably so the config is available during base
        # initialisation; confirm.
        self._pretrained_cfg = pretrained_cfg
        super().__init__(*args, **kwargs)

    def _parse_image_data(
        self,
        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
    ) -> ModalityDataItems[Any, Any] | None:
        """Parse image data, wrapping dict inputs in ``DictEmbeddingItems``.

        Non-dict inputs are delegated to the base-class parser.
        """
        if not isinstance(data, dict):
            return super()._parse_image_data(data)

        # Required fields are the input names declared in the config.
        required = _terratorch_field_names(self._pretrained_cfg)
        return DictEmbeddingItems(
            data,
            modality="image",
            required_fields=required,
            fields_factory=_terratorch_field_factory(self._pretrained_cfg),
        )

_pretrained_cfg instance-attribute

_pretrained_cfg = pretrained_cfg

__init__

__init__(pretrained_cfg: dict, *args, **kwargs)
Source code in vllm/model_executor/models/terratorch.py
def __init__(self, pretrained_cfg: dict, *args, **kwargs):
    """Store ``pretrained_cfg`` and forward the rest to the base class."""
    # NOTE(review): assigned before the base constructor runs, presumably
    # so the config is available during base initialisation; confirm.
    self._pretrained_cfg = pretrained_cfg
    super().__init__(
        *args,
        **kwargs,
    )

_parse_image_data

_parse_image_data(
    data: dict[str, Tensor] | ModalityData[ImageItem],
) -> ModalityDataItems[Any, Any] | None
Source code in vllm/model_executor/models/terratorch.py
def _parse_image_data(
    self,
    data: dict[str, torch.Tensor] | ModalityData[ImageItem],
) -> ModalityDataItems[Any, Any] | None:
    """Parse image data, wrapping dict inputs in ``DictEmbeddingItems``.

    Non-dict inputs are delegated to the base-class parser.
    """
    if not isinstance(data, dict):
        return super()._parse_image_data(data)

    # Required fields are the input names declared in the config.
    required = _terratorch_field_names(self._pretrained_cfg)
    return DictEmbeddingItems(
        data,
        modality="image",
        required_fields=required,
        fields_factory=_terratorch_field_factory(self._pretrained_cfg),
    )

TerratorchMultiModalProcessor

Bases: BaseMultiModalProcessor

Source code in vllm/model_executor/models/terratorch.py
class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
    """Multimodal processor that passes raw image fields through.

    No tokenisation or prompt rewriting takes place: ``apply`` always emits
    the single token id ``1`` and an empty image placeholder.
    """

    def __init__(
        self,
        info: TerratorchProcessingInfo,
        dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]",
        *,
        cache: MultiModalProcessorOnlyCache | None = None,
    ) -> None:
        """Read ``pretrained_cfg`` from the HF config, then init the base."""
        # NOTE(review): assigned before the base constructor runs,
        # presumably because base initialisation may call
        # ``_get_data_parser``; confirm.
        self.pretrained_cfg = info.get_hf_config().to_dict()["pretrained_cfg"]
        super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache)

    def _get_data_parser(self) -> MultiModalDataParser:
        """Return a Terratorch data parser bound to ``pretrained_cfg``."""
        return TerratorchMultiModalDataParser(pretrained_cfg=self.pretrained_cfg)

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Return the field config built from ``pretrained_cfg``.

        ``hf_processor_mm_kwargs`` is not used.
        """
        return _terratorch_field_factory(self.pretrained_cfg)(hf_inputs)

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        """Return no prompt updates; all arguments are ignored."""
        return []

    def apply(
        self,
        prompt: str | list[int],
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object] | None = None,
        mm_uuids: MultiModalUUIDDict | None = None,
    ) -> MultiModalInputs:
        """Build ``MultiModalInputs`` from raw image fields.

        Args:
            prompt: Not used; the emitted token ids are always ``[1]``.
            mm_data: Either ``{"image": {field: value}}`` or the field dict
                itself. Every value must provide ``unsqueeze(0)``.
            hf_processor_mm_kwargs: Forwarded to hashing and to the field
                config lookup.
            tokenization_kwargs: Forwarded to hashing; ``None`` becomes
                ``{}``.
            mm_uuids: Forwarded to hashing.

        Returns:
            Inputs of type ``"multimodal"`` with one zero-length image
            placeholder at offset 0.
        """
        # Both accepted input layouts get the same treatment, so select the
        # field dict first and transform it once.
        raw_fields = mm_data["image"] if "image" in mm_data else mm_data
        image_data = {k: v.unsqueeze(0) for k, v in raw_fields.items()}

        mm_data = {"image": image_data}

        mm_items = self._to_mm_items(mm_data)
        tokenization_kwargs = tokenization_kwargs or {}
        mm_hashes = self._hash_mm_items(
            mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
        )
        mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}

        mm_processed_data = BatchFeature(image_data)

        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
            mm_processed_data,
            self._get_mm_fields_config(mm_processed_data, hf_processor_mm_kwargs),
        )

        return MultiModalInputs(
            type="multimodal",
            prompt_token_ids=[1],
            mm_kwargs=mm_kwargs,
            mm_hashes=mm_hashes,
            mm_placeholders=mm_placeholders,
        )

pretrained_cfg instance-attribute

pretrained_cfg = to_dict()['pretrained_cfg']

__init__

__init__(
    info: TerratorchProcessingInfo,
    dummy_inputs: BaseDummyInputsBuilder[
        TerratorchProcessingInfo
    ],
    *,
    cache: MultiModalProcessorOnlyCache | None = None,
) -> None
Source code in vllm/model_executor/models/terratorch.py
def __init__(
    self,
    info: TerratorchProcessingInfo,
    dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]",
    *,
    cache: MultiModalProcessorOnlyCache | None = None,
) -> None:
    """Read ``pretrained_cfg`` from the HF config, then init the base."""
    # NOTE(review): assigned before the base constructor runs, presumably
    # because base initialisation may call ``_get_data_parser``; confirm.
    hf_config_dict = info.get_hf_config().to_dict()
    self.pretrained_cfg = hf_config_dict["pretrained_cfg"]
    super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache)

_get_data_parser

_get_data_parser() -> MultiModalDataParser
Source code in vllm/model_executor/models/terratorch.py
def _get_data_parser(self) -> MultiModalDataParser:
    """Return a Terratorch data parser bound to ``pretrained_cfg``."""
    parser = TerratorchMultiModalDataParser(pretrained_cfg=self.pretrained_cfg)
    return parser

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/terratorch.py
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    """Return the field config built from ``pretrained_cfg``.

    ``hf_processor_mm_kwargs`` is not used.
    """
    build_field_config = _terratorch_field_factory(self.pretrained_cfg)
    return build_field_config(hf_inputs)

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]
Source code in vllm/model_executor/models/terratorch.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
    """Return no prompt updates; all arguments are ignored."""
    no_updates = []
    return no_updates

apply

apply(
    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object] | None = None,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs
Source code in vllm/model_executor/models/terratorch.py
def apply(
    self,
    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object] | None = None,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs:
    """Build ``MultiModalInputs`` from raw image fields.

    Args:
        prompt: Not used; the emitted token ids are always ``[1]``.
        mm_data: Either ``{"image": {field: value}}`` or the field dict
            itself. Every value must provide ``unsqueeze(0)``.
        hf_processor_mm_kwargs: Forwarded to hashing and to the field
            config lookup.
        tokenization_kwargs: Forwarded to hashing; ``None`` becomes ``{}``.
        mm_uuids: Forwarded to hashing.

    Returns:
        Inputs of type ``"multimodal"`` with one zero-length image
        placeholder at offset 0.
    """
    # Both accepted input layouts get the same treatment, so select the
    # field dict first and transform it once.
    raw_fields = mm_data["image"] if "image" in mm_data else mm_data
    image_data = {k: v.unsqueeze(0) for k, v in raw_fields.items()}

    mm_data = {"image": image_data}

    mm_items = self._to_mm_items(mm_data)
    tokenization_kwargs = tokenization_kwargs or {}
    mm_hashes = self._hash_mm_items(
        mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
    )
    mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}

    mm_processed_data = BatchFeature(image_data)

    mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
        mm_processed_data,
        self._get_mm_fields_config(mm_processed_data, hf_processor_mm_kwargs),
    )

    return MultiModalInputs(
        type="multimodal",
        prompt_token_ids=[1],
        mm_kwargs=mm_kwargs,
        mm_hashes=mm_hashes,
        mm_placeholders=mm_placeholders,
    )

TerratorchProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/terratorch.py
class TerratorchProcessingInfo(BaseProcessingInfo):
    """Processing info declaring ``"image"`` as the only modality."""

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        """Return ``{"image": None}``."""
        limits = {"image": None}
        return limits

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, int | None]
Source code in vllm/model_executor/models/terratorch.py
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
    return {"image": None}

_terratorch_field_factory

_terratorch_field_factory(
    pretrained_cfg: dict,
) -> Callable[
    [Mapping[str, Tensor]],
    Mapping[str, MultiModalFieldConfig],
]
Source code in vllm/model_executor/models/terratorch.py
def _terratorch_field_factory(
    pretrained_cfg: dict,
) -> Callable[
    [Mapping[str, torch.Tensor]],
    Mapping[str, MultiModalFieldConfig],
]:
    """Return a callable that yields the multimodal field configs.

    The returned callable builds an ``InputDefinition`` from
    ``pretrained_cfg["input"]`` on each call and maps every input whose
    type is ``InputTypeEnum.tensor`` to a batched ``"image"`` field config.
    """

    def _terratorch_field_config(hf_inputs: Mapping[str, torch.Tensor]):
        # ``hf_inputs`` is not inspected; the field set comes solely from
        # the config.
        definition = InputDefinition(**pretrained_cfg["input"])
        tensor_field_names = [
            name
            for name, spec in definition.data.items()
            if spec.type == InputTypeEnum.tensor
        ]

        return {
            name: MultiModalFieldConfig.batched(modality="image")
            for name in tensor_field_names
        }

    return _terratorch_field_config

_terratorch_field_names

_terratorch_field_names(pretrained_cfg: dict)
Source code in vllm/model_executor/models/terratorch.py
def _terratorch_field_names(pretrained_cfg: dict):
    """Return the set of input names declared in ``pretrained_cfg["input"]``."""
    declared_inputs = InputDefinition(**pretrained_cfg["input"]).data
    return set(declared_inputs.keys())