Bases: PretrainedConfig
Kimi-K2.5 model configuration.
Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks. A video-chunk consists of multiple consecutive frames that are processed together with temporal pooling.
Parameters:
| Name | Type | Description | Default |
|------|------|-------------|---------|
| `vision_config` | `dict \| KimiK25VisionConfig \| None` | Configuration for the vision tower and projector. | `None` |
| `text_config` | `dict \| DeepseekV3Config \| None` | Configuration for the text model (DeepseekV3). | `None` |
| `ignore_index` | `int` | The ignore index for the loss function. | `-100` |
| `media_placeholder_token_id` | `int` | The token ID for media placeholders. | `163605` |
| `pad_token_id` | `int` | The token ID for padding. | `0` |
| `use_unified_vision_chunk` | `bool` | Whether to use a single unified vision chunk for vision inputs. | `False` |
| `video_placeholder` | `str` | The placeholder string for video inputs. | `"<\|kimi_k25_video_placeholder\|>"` |
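A minimal construction sketch follows. Plain dicts are coerced into the typed sub-configs; the `hidden_size` fields shown here are illustrative assumptions, since the accepted fields depend on `KimiK25VisionConfig` and `DeepseekV3Config`.

```python
# Minimal usage sketch. The sub-config field values are assumptions
# for illustration; check KimiK25VisionConfig / DeepseekV3Config for
# the fields a real checkpoint actually needs.
from vllm.transformers_utils.configs.kimi_k25 import KimiK25Config

config = KimiK25Config(
    vision_config={"hidden_size": 1152},  # dict coerced to KimiK25VisionConfig
    text_config={"hidden_size": 4096},    # dict coerced to DeepseekV3Config
)
print(config.model_type)  # "kimi_k25"
```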
Source code in vllm/transformers_utils/configs/kimi_k25.py
```python
class KimiK25Config(PretrainedConfig):
    """Kimi-K2.5 model configuration.

    Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks.
    A video-chunk consists of multiple consecutive frames
    that are processed together with temporal pooling.

    Args:
        vision_config: Configuration for the vision tower and projector.
        text_config: Configuration for the text model (DeepseekV3).
        ignore_index: The ignore index for the loss function.
        media_placeholder_token_id: The token ID for media placeholders.
        pad_token_id: The token ID for padding.
        use_unified_vision_chunk: Whether to use a single unified vision
            chunk for vision inputs.
        video_placeholder: The placeholder string for video inputs.
    """

    model_type = "kimi_k25"

    def __init__(
        self,
        vision_config: dict | KimiK25VisionConfig | None = None,
        text_config: dict | DeepseekV3Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = False,
        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
        **kwargs,
    ):
        # Vision config: accept a ready-made config or coerce a dict.
        if vision_config is None:
            vision_config = KimiK25VisionConfig()
        elif isinstance(vision_config, dict):
            vision_config = KimiK25VisionConfig(**vision_config)
        self.vision_config: KimiK25VisionConfig = vision_config

        # Text config: same coercion for the DeepseekV3 backbone.
        if text_config is None:
            text_config = DeepseekV3Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        self.text_config: DeepseekV3Config = text_config

        # mm_hidden_size defaults to the vision hidden size; if it was not
        # explicitly overridden, align it with the text model's hidden size.
        if self.vision_config.mm_hidden_size == self.vision_config.hidden_size:
            self.vision_config.mm_hidden_size = self.text_config.hidden_size

        # Other config
        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id
        self.use_unified_vision_chunk = use_unified_vision_chunk
        self.video_placeholder = video_placeholder

        # Propagate quantization config from the text model, if present.
        if getattr(self.text_config, "quantization_config", None) is not None:
            self.quantization_config = self.text_config.quantization_config

        super().__init__(pad_token_id=pad_token_id, **kwargs)

    @property
    def hidden_size(self) -> int:
        """Get hidden size from text config for compatibility."""
        return self.text_config.hidden_size

    @property
    def vocab_size(self) -> int:
        """Get vocab size from text config for compatibility."""
        return self.text_config.vocab_size
```
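Note the defaulting rule in `__init__`: if `mm_hidden_size` still equals the vision `hidden_size`, it is treated as "not explicitly set" and rewritten to the text model's hidden size, so the projector output matches the language model width. A hedged sketch, assuming `mm_hidden_size` defaults to the vision `hidden_size` (which is what the equality check implies):

```python
# Sketch of the mm_hidden_size rule; the assumed default follows from
# the equality check in __init__, and the sizes are illustrative.
from vllm.transformers_utils.configs.kimi_k25 import KimiK25Config

cfg = KimiK25Config(
    vision_config={"hidden_size": 1152},  # mm_hidden_size left at its default
    text_config={"hidden_size": 4096},
)
assert cfg.vision_config.mm_hidden_size == 4096  # aligned to the text width
```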
Properties:

| Name | Type | Description |
|------|------|-------------|
| `hidden_size` | `int` | Hidden size from the text config, exposed for compatibility. |
| `vocab_size` | `int` | Vocab size from the text config, exposed for compatibility. |
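Both properties read straight through to `text_config`, so downstream code that expects a flat, text-only config keeps working:

```python
# The compatibility properties delegate to the wrapped text config.
# The sizes below are illustrative, not taken from a real checkpoint.
from vllm.transformers_utils.configs.kimi_k25 import KimiK25Config

cfg = KimiK25Config(text_config={"hidden_size": 4096, "vocab_size": 163840})
assert cfg.hidden_size == cfg.text_config.hidden_size == 4096
assert cfg.vocab_size == cfg.text_config.vocab_size == 163840
```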