vllm.model_executor.models.glmasr_utils

_calculate_conv_output_length

_calculate_conv_output_length(
    input_length: Tensor,
    padding: int,
    kernel_size: int,
    stride: int,
) -> Tensor

Calculate Conv1d output length using standard formula.

Source code in vllm/model_executor/models/glmasr_utils.py
def _calculate_conv_output_length(
    input_length: torch.Tensor, padding: int, kernel_size: int, stride: int
) -> torch.Tensor:
    """Calculate Conv1d output length using standard formula."""
    # in sync with `hf_processor._get_audio_token_length`
    return (input_length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
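
A minimal usage sketch; the padding/kernel/stride values below are hypothetical, not the model's actual conv configuration:

import torch

input_length = torch.tensor([3000, 1500])
output_length = _calculate_conv_output_length(
    input_length, padding=1, kernel_size=3, stride=2
)
# (3000 + 2*1 - (3 - 1) - 1) // 2 + 1 = 1500
# (1500 + 2*1 - (3 - 1) - 1) // 2 + 1 = 750
print(output_length)  # tensor([1500, 750])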

_extract_mask_for_item

_extract_mask_for_item(
    feature_attention_mask: Tensor | list[Tensor],
    chunk_counts: Tensor | list[int] | None,
    item_idx: int,
) -> Tensor

Extract attention mask for a specific audio item.

Source code in vllm/model_executor/models/glmasr_utils.py
def _extract_mask_for_item(
    feature_attention_mask: torch.Tensor | list[torch.Tensor],
    chunk_counts: torch.Tensor | list[int] | None,
    item_idx: int,
) -> torch.Tensor:
    """Extract attention mask for a specific audio item."""
    if chunk_counts is None:
        # Single item per audio
        mask = feature_attention_mask[item_idx]
        if isinstance(feature_attention_mask, torch.Tensor):
            return mask.unsqueeze(0)
        return _normalize_to_tensor(mask)

    # Multiple chunks per audio: calculate slice indices
    counts = _as_list_chunk_counts(chunk_counts)
    start_idx = sum(counts[:item_idx])
    end_idx = start_idx + counts[item_idx]

    # Extract slice
    if isinstance(feature_attention_mask, torch.Tensor):
        return feature_attention_mask[start_idx:end_idx]
    mask_slice = feature_attention_mask[start_idx:end_idx]
    return _normalize_to_tensor(mask_slice)
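
A minimal usage sketch with hypothetical shapes: two audios, where the first was split into 2 chunks and the second into 1, each chunk padded to 3000 frames:

import torch

feature_attention_mask = torch.ones(3, 3000, dtype=torch.long)
chunk_counts = [2, 1]

mask_0 = _extract_mask_for_item(feature_attention_mask, chunk_counts, item_idx=0)
mask_1 = _extract_mask_for_item(feature_attention_mask, chunk_counts, item_idx=1)
print(mask_0.shape)  # torch.Size([2, 3000]) -- chunk rows 0 and 1
print(mask_1.shape)  # torch.Size([1, 3000]) -- chunk row 2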

_get_audio_output_lengths_for_tower

_get_audio_output_lengths_for_tower(
    audio_tower: Module,
    audio_lengths: Tensor,
    merge_factor: int,
    conv_params: list[tuple[int, int, int]],
) -> Tensor

Calculate the output lengths after audio processing.

The output length accounts for:

1. Convolution layers (downsampling)
2. Merge factor (further downsampling during projection)

Parameters:

    audio_tower (Module): The audio encoder module. Required.
    audio_lengths (Tensor): Input feature lengths [batch_size]. Required.
    merge_factor (int): Factor for merging adjacent features. Required.
    conv_params (list[tuple[int, int, int]]): List of (padding, kernel_size, stride) for each conv layer. Required.

Returns:

    Tensor: Output lengths after all processing [batch_size].

Source code in vllm/model_executor/models/glmasr_utils.py
def _get_audio_output_lengths_for_tower(
    audio_tower: nn.Module,
    audio_lengths: torch.Tensor,
    merge_factor: int,
    conv_params: list[tuple[int, int, int]],
) -> torch.Tensor:
    """
    Calculate the output lengths after audio processing.

    The output length accounts for:
    1. Convolution layers (downsampling)
    2. Merge factor (further downsampling during projection)

    Args:
        audio_tower: The audio encoder module
        audio_lengths: Input feature lengths [batch_size]
        merge_factor: Factor for merging adjacent features
        conv_params: List of (padding, kernel_size, stride) for each conv layer

    Returns:
        Output lengths after all processing [batch_size]
    """
    # First, calculate the output length after convolutions
    if hasattr(audio_tower, "_get_feat_extract_output_lengths"):
        _, conv_output_lengths = audio_tower._get_feat_extract_output_lengths(
            audio_lengths
        )
    else:
        conv_output_lengths = audio_lengths
        for padding, kernel_size, stride in conv_params:
            conv_output_lengths = _calculate_conv_output_length(
                conv_output_lengths, padding, kernel_size, stride
            )

    # Then, apply merge_factor to get final output length
    # Formula: (conv_output_lengths - merge_factor) // merge_factor + 1
    return (conv_output_lengths - merge_factor) // merge_factor + 1
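
A minimal usage sketch, assuming a tower that lacks `_get_feat_extract_output_lengths` so the fallback conv_params path is exercised; the conv parameters and merge_factor below are hypothetical:

import torch
import torch.nn as nn

audio_tower = nn.Identity()  # stand-in module without _get_feat_extract_output_lengths
audio_lengths = torch.tensor([3000])
conv_params = [(1, 3, 1), (1, 3, 2)]  # (padding, kernel_size, stride) per layer

out = _get_audio_output_lengths_for_tower(
    audio_tower, audio_lengths, merge_factor=2, conv_params=conv_params
)
# conv 1: (3000 + 2 - 2 - 1) // 1 + 1 = 3000
# conv 2: (3000 + 2 - 2 - 1) // 2 + 1 = 1500
# merge:  (1500 - 2) // 2 + 1 = 750
print(out)  # tensor([750])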

_normalize_to_tensor

_normalize_to_tensor(mask: Tensor | list[Tensor]) -> Tensor

Convert mask to tensor, handling both list and tensor formats.

Source code in vllm/model_executor/models/glmasr_utils.py
def _normalize_to_tensor(mask: torch.Tensor | list[torch.Tensor]) -> torch.Tensor:
    """Convert mask to tensor, handling both list and tensor formats."""
    if isinstance(mask, list):
        return (
            torch.stack(mask)
            if mask and isinstance(mask[0], torch.Tensor)
            else torch.tensor(mask)
        )
    return mask
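
A minimal sketch of the input forms the helper accepts:

import torch

already_tensor = torch.ones(2, 4)
list_of_tensors = [torch.ones(4), torch.ones(4)]

print(_normalize_to_tensor(already_tensor).shape)   # torch.Size([2, 4]) -- returned as-is
print(_normalize_to_tensor(list_of_tensors).shape)  # torch.Size([2, 4]) -- stacked
print(_normalize_to_tensor([1, 1, 0, 0]))           # tensor([1, 1, 0, 0]) -- plain list fallback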