
vllm.transformers_utils.configs.kimi_k25

Kimi-K2.5 Model Configuration.

This configuration supports video-chunks as an internal modality type. A video-chunk is the smallest independently processable unit of video.

KimiK25Config

Bases: PretrainedConfig

Kimi-K2.5 model configuration.

Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks. A video-chunk consists of multiple consecutive frames that are processed together with temporal pooling.

Parameters:

vision_config (dict | KimiK25VisionConfig | None, default: None)
    Configuration for the vision tower and projector.

text_config (dict | DeepseekV3Config | None, default: None)
    Configuration for the text model (DeepseekV3).

ignore_index (int, default: -100)
    The ignore index for the loss function.

media_placeholder_token_id (int, default: 163605)
    The token ID for media placeholders.

pad_token_id (int, default: 0)
    The token ID for padding.
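The snippet below is a minimal construction sketch, assuming the class is importable from vllm.transformers_utils.configs.kimi_k25 as shown and that the text sub-config accepts the standard hidden_size/vocab_size keyword arguments; the numeric values are purely illustrative. Dict values are coerced into the corresponding sub-config classes, and None falls back to each sub-config's defaults.

from vllm.transformers_utils.configs.kimi_k25 import KimiK25Config

# Dicts are coerced into KimiK25VisionConfig / DeepseekV3Config instances;
# None falls back to each sub-config's defaults.
config = KimiK25Config(
    vision_config=None,
    text_config={"hidden_size": 1024, "vocab_size": 163840},
    media_placeholder_token_id=163605,
    pad_token_id=0,
)

print(type(config.vision_config).__name__)  # KimiK25VisionConfig
print(type(config.text_config).__name__)    # DeepseekV3Config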
Source code in vllm/transformers_utils/configs/kimi_k25.py
class KimiK25Config(PretrainedConfig):
    """Kimi-K2.5 model configuration.

    Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks.
    A video-chunk consists of multiple consecutive frames
    that are processed together with temporal pooling.

    Args:
        vision_config: Configuration for the vision tower and projector.
        text_config: Configuration for the text model (DeepseekV3).
        ignore_index: The ignore index for the loss function.
        media_placeholder_token_id: The token ID for media placeholders.
        pad_token_id: The token ID for padding.
    """

    model_type = "kimi_k25"

    def __init__(
        self,
        vision_config: dict | KimiK25VisionConfig | None = None,
        text_config: dict | DeepseekV3Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = False,
        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
        **kwargs,
    ):
        # Vision config
        if vision_config is None:
            vision_config = KimiK25VisionConfig()
        elif isinstance(vision_config, dict):
            vision_config = KimiK25VisionConfig(**vision_config)
        self.vision_config: KimiK25VisionConfig = vision_config

        # Text config
        if text_config is None:
            text_config = DeepseekV3Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        self.text_config: DeepseekV3Config = text_config

        # Set mm_hidden_size to the text hidden size if it was not explicitly
        # set (i.e. it still equals the vision tower's own hidden_size)
        if self.vision_config.mm_hidden_size == self.vision_config.hidden_size:
            self.vision_config.mm_hidden_size = self.text_config.hidden_size

        # Other config
        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id
        self.use_unified_vision_chunk = use_unified_vision_chunk
        self.video_placeholder = video_placeholder

        # Propagate quantization config from text model
        if getattr(self.text_config, "quantization_config", None) is not None:
            self.quantization_config = self.text_config.quantization_config

        super().__init__(pad_token_id=pad_token_id, **kwargs)

    @property
    def hidden_size(self) -> int:
        """Get hidden size from text config for compatibility."""
        return self.text_config.hidden_size

    @property
    def vocab_size(self) -> int:
        """Get vocab size from text config for compatibility."""
        return self.text_config.vocab_size

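Two side effects of __init__ are worth noting: when the vision config's mm_hidden_size is left at its default (equal to the vision tower's hidden_size), it is rewritten to the text model's hidden size so the projector output matches the language-model width, and any quantization_config present on the text config is mirrored at the top level. A minimal sketch of the first behavior, assuming the default KimiK25VisionConfig ships with mm_hidden_size equal to its hidden_size:

cfg = KimiK25Config(text_config={"hidden_size": 2048, "vocab_size": 163840})

# mm_hidden_size was synced to the text hidden size during __init__
# (assumes the vision default was not explicitly overridden).
assert cfg.vision_config.mm_hidden_size == 2048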
hidden_size property

hidden_size: int

Get hidden size from text config for compatibility.

vocab_size property

vocab_size: int

Get vocab size from text config for compatibility.
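For downstream code that only needs the text-model dimensions, both properties simply proxy the nested text config. A minimal sketch using the default sub-configs:

cfg = KimiK25Config()
assert cfg.hidden_size == cfg.text_config.hidden_size
assert cfg.vocab_size == cfg.text_config.vocab_size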