
vllm.model_executor.models.midashenglm

Inference-only MiDashengLM model compatible with HuggingFace weights.

MiDashengLMAudioInputs

Bases: TensorSchema

Dimensions
  • bn: Batch size * number of audios
  • p: Number of sampling points
Source code in vllm/model_executor/models/midashenglm.py
class MiDashengLMAudioInputs(TensorSchema):
    """

    Dimensions:
        - bn: Batch size * number of audios
        - p: Number of sampling points
    """

    input_values: Annotated[torch.Tensor, TensorShape("bn", "p")]
    audio_length: Annotated[torch.Tensor, TensorShape("bn")]
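
A minimal construction sketch with dummy tensors. The shapes follow the documented dimensions (bn = batch size * number of audios, p = sampling points); the keyword-argument construction and the concrete sizes below are illustrative assumptions, not taken from the vLLM source.

import torch

from vllm.model_executor.models.midashenglm import MiDashengLMAudioInputs

# Illustrative sizes: two clips of one second each at 16 kHz (assumed).
bn, p = 2, 16000
audio_inputs = MiDashengLMAudioInputs(
    input_values=torch.zeros(bn, p),           # (bn, p) raw waveform samples
    audio_length=torch.tensor([16000, 8000]),  # (bn,) valid samples per clip
)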

calculate_mel_frames_dasheng

calculate_mel_frames_dasheng(
    audio_length_samples: int,
    n_fft: int = 512,
    hop_size: int = 160,
    dasheng_subsampling: int = 4,
    center=True,
    model_subsampling: int = 5,
) -> int

Calculate the number of Mel-spectrogram frames.

Source code in vllm/model_executor/models/midashenglm.py
def calculate_mel_frames_dasheng(
    audio_length_samples: int,
    n_fft: int = 512,
    hop_size: int = 160,
    dasheng_subsampling: int = 4,
    center=True,
    model_subsampling: int = 5,
) -> int:
    """Calculate the number of Mel-spectrogram frames."""
    if center:
        audio_length_samples = audio_length_samples + n_fft

    return (
        int(1 + ((audio_length_samples - n_fft) / hop_size))
        // dasheng_subsampling
        // model_subsampling
    )
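
A worked example under the default arguments (the 16000-sample input, i.e. one second of 16 kHz audio, is an illustrative assumption):

from vllm.model_executor.models.midashenglm import calculate_mel_frames_dasheng

num_frames = calculate_mel_frames_dasheng(16000)
# With center=True the input is padded to 16000 + 512 = 16512 samples,
# giving 1 + (16512 - 512) // 160 = 101 STFT frames, then
# 101 // 4 // 5 = 5 frames after Dasheng and model subsampling.
print(num_frames)  # 5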