
vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe

_supports_activation

_supports_activation(activation: MoEActivation) -> bool

Supports silu activation only.

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_activation(activation: MoEActivation) -> bool:
    """Supports silu activation only."""
    return activation == MoEActivation.SILU

_supports_current_device

_supports_current_device() -> bool

Supports only Blackwell-family GPUs.

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_current_device() -> bool:
    """Supports only Blackwell-family GPUs."""
    p = current_platform
    return p.is_cuda() and p.is_device_capability_family(100)
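For reference, capability family 100 corresponds to compute capability 10.x (Blackwell-class data-center GPUs). The following is a rough standalone sketch of the same check using plain PyTorch; it is illustrative only (the real code goes through current_platform, and the helper name below is made up):

import torch

def _is_blackwell_family_sketch() -> bool:
    # Sketch: capability family 100 means compute capability 10.x (Blackwell).
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major == 10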

_supports_no_act_and_mul

_supports_no_act_and_mul() -> bool

Does not support non-gated MoE (e.g. Nanotron-Mini).

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_no_act_and_mul() -> bool:
    """Does not support non-gated MoE (i.e. Nanotron-Mini)."""
    return False

_supports_parallel_config

_supports_parallel_config(
    moe_parallel_config: FusedMoEParallelConfig,
) -> bool

The TRTLLM kernel does not support EPLB.

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
    """Supports TRTLLM Kernel does not support EPLB."""
    return not moe_parallel_config.enable_eplb

_supports_quant_scheme

_supports_quant_scheme(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
) -> bool

Supports FP8 per-tensor and FP8 128-block quantization.

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_quant_scheme(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
) -> bool:
    """Supports Fp8 per-tensor and Fp8 block."""
    SUPPORTED_W_A = [
        (kFp8Static128BlockSym, kFp8Dynamic128Sym),
        (kFp8StaticTensorSym, kFp8StaticTensorSym),
    ]
    return (weight_key, activation_key) in SUPPORTED_W_A
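A minimal sketch of how this check behaves for the QuantKey constants referenced in the source above (assumed to be in scope, as they are inside this module); any unlisted combination, including unquantized, is rejected:

assert _supports_quant_scheme(kFp8StaticTensorSym, kFp8StaticTensorSym)
assert _supports_quant_scheme(kFp8Static128BlockSym, kFp8Dynamic128Sym)
assert not _supports_quant_scheme(None, None)  # unquantized
assert not _supports_quant_scheme(kFp8StaticTensorSym, kFp8Dynamic128Sym)  # mixed scheme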

_supports_router_logits_dtype

_supports_router_logits_dtype(
    router_logits_dtype: dtype | None,
    routing_method: RoutingMethodType,
) -> bool

The FlashInfer TRTLLM FP8 kernel expects bfloat16 router_logits by default. Only DeepSeekV3 routing supports float32 router_logits (which is converted internally in the kernel).

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_router_logits_dtype(
    router_logits_dtype: torch.dtype | None,
    routing_method: RoutingMethodType,
) -> bool:
    """
    The FlashInfer TRTLLM FP8 kernel expects bfloat16 router_logits by default.
    Only DeepSeekV3 routing supports float32 router_logits (which is converted
    internally in the kernel).
    """
    if router_logits_dtype == torch.float32:
        # Only DeepSeekV3 routing handles float32 logits
        # https://github.com/flashinfer-ai/flashinfer/issues/2469
        return routing_method == RoutingMethodType.DeepSeekV3
    return True
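A minimal sketch of the resulting behavior, assuming this helper and RoutingMethodType are in scope:

import torch

# bfloat16 (or unset) router logits pass for any routing method.
assert _supports_router_logits_dtype(torch.bfloat16, RoutingMethodType.Renormalize)
assert _supports_router_logits_dtype(None, RoutingMethodType.Llama4)
# float32 router logits pass only with DeepSeekV3 routing.
assert _supports_router_logits_dtype(torch.float32, RoutingMethodType.DeepSeekV3)
assert not _supports_router_logits_dtype(torch.float32, RoutingMethodType.Renormalize)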

_supports_routing_method

_supports_routing_method(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
    routing_method: RoutingMethodType,
) -> bool

Monolithic kernels perform routing internally, so they must declare which routing methods they support.

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def _supports_routing_method(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
    routing_method: RoutingMethodType,
) -> bool:
    """Monolithic kernels need to express router support."""
    # NOTE(dbari): TopK routing could also be enabled, but need to validate models
    # NOTE(dbari): Default is not implemented and should not be enabled until it is
    if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym):
        # NOTE(rob): potentially allow others here. This is a conservative list.
        return routing_method in [
            RoutingMethodType.DeepSeekV3,
            RoutingMethodType.Renormalize,
            RoutingMethodType.RenormalizeNaive,
        ]
    elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
        # NOTE(dbari): as above, potentially allow others here.
        return routing_method in [
            RoutingMethodType.Llama4,
            RoutingMethodType.Renormalize,
            RoutingMethodType.RenormalizeNaive,
        ]
    else:
        raise ValueError("Unsupported quantization scheme.")
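Note that this helper assumes the quantization scheme has already been validated by _supports_quant_scheme; for any other (weight_key, activation_key) pair it raises ValueError rather than returning False. A minimal sketch of the intended call order (mirroring is_supported_config_trtllm_fp8 below), assuming the three arguments are in scope:

if not _supports_quant_scheme(weight_key, activation_key):
    supported = False
else:
    # The ValueError branch is unreachable once the quant scheme has been validated.
    supported = _supports_routing_method(weight_key, activation_key, routing_method)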

is_supported_config_trtllm_bf16

is_supported_config_trtllm_bf16(
    moe_config: FusedMoEConfig,
    activation_format: FusedMoEActivationFormat,
) -> tuple[bool, str | None]

This method mirrors mk.FusedMoEPermuteExpertsUnpermute.is_supported_config for BF16 unquantized kernels.

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def is_supported_config_trtllm_bf16(
    moe_config: FusedMoEConfig,
    activation_format: mk.FusedMoEActivationFormat,
) -> tuple[bool, str | None]:
    """
    This method mirrors mk.FusedMoEPermuteExpertsUnpermute.is_supported_config
    for BF16 unquantized kernels.
    """

    def _make_reason(reason: str) -> str:
        return f"kernel does not support {reason}"

    if not _supports_current_device():
        return False, _make_reason("current device")
    elif not (moe_config.is_act_and_mul or _supports_no_act_and_mul()):
        return False, _make_reason("no act_and_mul MLP layer")
    elif not _supports_activation(moe_config.activation):
        return False, _make_reason(f"{moe_config.activation} activation")
    elif not _supports_parallel_config(moe_config.moe_parallel_config):
        return False, _make_reason("parallel config")
    elif not _supports_routing_method_bf16(moe_config.routing_method):
        return False, _make_reason("routing method")
    elif activation_format != mk.FusedMoEActivationFormat.Standard:
        return False, _make_reason("activation format")

    return True, None
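A minimal usage sketch, assuming a FusedMoEConfig instance named moe_config is already available (its construction is not covered on this page):

supported, reason = is_supported_config_trtllm_bf16(
    moe_config, mk.FusedMoEActivationFormat.Standard
)
if not supported:
    print(f"FlashInfer TRTLLM BF16 MoE kernel unavailable: {reason}")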

is_supported_config_trtllm_fp8

is_supported_config_trtllm_fp8(
    moe_config: FusedMoEConfig,
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
    activation_format: FusedMoEActivationFormat,
) -> tuple[bool, str | None]

This method mirrors mk.FusedMoEPermuteExpertsUnpermute.is_supported_config

Source code in vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
def is_supported_config_trtllm_fp8(
    moe_config: FusedMoEConfig,
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
    activation_format: mk.FusedMoEActivationFormat,
) -> tuple[bool, str | None]:
    """
    This method mirrors mk.FusedMoEPermuteExpertsUnpermute.is_supported_config
    """

    def _make_reason(reason: str) -> str:
        return f"kernel does not support {reason}"

    if not _supports_current_device():
        return False, _make_reason("current device")
    elif not (moe_config.is_act_and_mul or _supports_no_act_and_mul()):
        return False, _make_reason("no act_and_mul MLP layer")
    elif not _supports_activation(moe_config.activation):
        return False, _make_reason(f"{moe_config.activation} activation")
    elif not _supports_quant_scheme(weight_key, activation_key):
        return False, _make_reason("quantization scheme")
    elif not _supports_parallel_config(moe_config.moe_parallel_config):
        return False, _make_reason("parallel config")
    elif not _supports_routing_method(
        weight_key, activation_key, moe_config.routing_method
    ):
        return False, _make_reason("routing method")
    elif activation_format != mk.FusedMoEActivationFormat.Standard:
        return False, _make_reason("activation format")
    elif not _supports_router_logits_dtype(
        moe_config.router_logits_dtype, moe_config.routing_method
    ):
        return False, _make_reason("float32 router_logits with non-DeepSeekV3 routing")

    return True, None
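A minimal usage sketch, assuming moe_config, weight_key, and activation_key are already in hand (their construction is not covered on this page):

supported, reason = is_supported_config_trtllm_fp8(
    moe_config,
    weight_key,
    activation_key,
    mk.FusedMoEActivationFormat.Standard,
)
if supported:
    # Safe to select the FlashInfer TRTLLM FP8 MoE kernel for this layer.
    ...
else:
    print(f"FlashInfer TRTLLM FP8 MoE kernel unavailable: {reason}")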