[Dev] Enhance Backend Abstraction for TileLang #255

Merged: 55 commits, Dec 8, 2024

Changes from all commits

Commits (55)
2a0f59c
relax transform update
Nov 10, 2024
b475407
End2end Fix
Nov 11, 2024
b0738ba
Merge branch 'main' of https://github.com/microsoft/BitBLAS into relax
Nov 11, 2024
f23a2ec
lint fix
Nov 11, 2024
79826b6
Merge branch 'main' of https://github.com/microsoft/BitBLAS into relax
LeiWang1999 Nov 27, 2024
1961bc4
bf16 test fix
LeiWang1999 Nov 27, 2024
3aa5d82
format fix
LeiWang1999 Nov 27, 2024
353e279
lint fix
LeiWang1999 Nov 27, 2024
7eb315f
test fix
LeiWang1999 Nov 27, 2024
c1b452f
test fix
LeiWang1999 Nov 27, 2024
fe93429
update commits
LeiWang1999 Nov 27, 2024
ccac456
test fix
LeiWang1999 Nov 27, 2024
ddaeba2
Merge branch 'main' of https://github.com/microsoft/BitBLAS into bf16…
LeiWang1999 Nov 28, 2024
4b6fddb
submodule update
LeiWang1999 Nov 28, 2024
a8ccb17
Implement FP4
LeiWang1999 Nov 29, 2024
e2632e6
lint fix
LeiWang1999 Nov 29, 2024
47abe0a
lint fix
LeiWang1999 Nov 29, 2024
1b5a336
testfix
LeiWang1999 Nov 29, 2024
02c09eb
test fix
LeiWang1999 Nov 29, 2024
ec0e00c
lint fix
LeiWang1999 Nov 29, 2024
667b36c
lint fix
LeiWang1999 Nov 29, 2024
2193164
bugfix
LeiWang1999 Nov 29, 2024
478a0c7
support dp4a and fix test
LeiWang1999 Nov 29, 2024
c323c79
format fix
LeiWang1999 Nov 29, 2024
a9559a2
implement simt
LeiWang1999 Nov 29, 2024
32e8141
submodule update
LeiWang1999 Nov 29, 2024
017b0a7
lint fix
LeiWang1999 Nov 29, 2024
5eb8c16
Code refactorization
LeiWang1999 Dec 1, 2024
7e2b3a9
BUG Fix
LeiWang1999 Dec 1, 2024
a4a741d
optimize import
LeiWang1999 Dec 1, 2024
347dc31
optimize import
LeiWang1999 Dec 1, 2024
e3c371e
submodule update
LeiWang1999 Dec 1, 2024
6e2e595
test case fix
LeiWang1999 Dec 1, 2024
ccf66a8
Enhance top warp hint
LeiWang1999 Dec 1, 2024
c70c6c0
typo fix
LeiWang1999 Dec 1, 2024
de63591
optimize code
LeiWang1999 Dec 2, 2024
4663996
Support TL Wrapper with Dynamic Shape
LeiWang1999 Dec 3, 2024
801e675
Code Reformat
LeiWang1999 Dec 3, 2024
0852f86
Enhance Layout Inference Pass
LeiWang1999 Dec 3, 2024
5cd120c
Implement tuning with dynamic shape
LeiWang1999 Dec 3, 2024
5b689bc
Merge branch 'main' of https://github.com/microsoft/BitBLAS into dyna…
LeiWang1999 Dec 3, 2024
d4dd664
optimize dequantize code structure
LeiWang1999 Dec 3, 2024
b231f84
Support WMMA
LeiWang1999 Dec 4, 2024
658a7f4
Smart Rewrite Support
LeiWang1999 Dec 4, 2024
70fba29
support simt
LeiWang1999 Dec 4, 2024
58f470c
typofix
LeiWang1999 Dec 4, 2024
e3b42b6
implement dequant test
LeiWang1999 Dec 5, 2024
b89d71e
test fix
LeiWang1999 Dec 5, 2024
1f28c19
test fix
LeiWang1999 Dec 5, 2024
ff8966b
test fix
LeiWang1999 Dec 5, 2024
142771d
lint fix
LeiWang1999 Dec 6, 2024
a06f773
fix for rescale zeros
LeiWang1999 Dec 6, 2024
f697321
Support A100
LeiWang1999 Dec 6, 2024
a8a0af5
update
LeiWang1999 Dec 8, 2024
00f170d
format
LeiWang1999 Dec 8, 2024
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 321f41 to 8e2f4b
2 changes: 0 additions & 2 deletions bitblas/__init__.py
@@ -133,7 +133,6 @@ def remove_tvm_path(path):
logger.warning(CUTLASS_NOT_FOUND_MESSAGE)

import tvm as tvm # noqa: E402
-from . import gpu # noqa: F401
from .base import (
TileDevice, # noqa: F401
fast_tune, # noqa: F401
@@ -148,7 +147,6 @@ def remove_tvm_path(path):
ApplyDefaultSchedule, # noqa: F401
ApplyFastTuning, # noqa: F401
)
-from . import testing # noqa: F401
from .utils import auto_detect_nvidia_target, apply_transform_on_input # noqa: F401
from .ops.general_matmul import MatmulConfig, Matmul # noqa: F401
from .ops.general_matmul_splitk import MatmulConfigWithSplitK, MatmulWithSplitK # noqa: F401
4 changes: 3 additions & 1 deletion bitblas/base/__init__.py
@@ -11,7 +11,9 @@
normalize_prim_func, # noqa: F401
) # noqa: F401
from .common_schedules import get_block, get_output_blocks, try_inline, try_inline_contiguous_spatial # noqa: F401
+from .base_scheduler import simplify_prim_func # noqa: F401
from .schedule_rule import ScheduleRule # noqa: F401
-from .utils import fast_tune, fast_tune_with_dynamic_range # noqa: F401
+from .tuner import fast_tune, fast_tune_with_dynamic_range # noqa: F401
from .roller import *
from .arch import CUDA, CDNA # noqa: F401
+from .operator_common import TransformKind, OptimizeStrategy, BackendKind # noqa: F401
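
A minimal usage sketch, assuming a BitBLAS build that includes these changes: `fast_tune` and `fast_tune_with_dynamic_range` now come from `.tuner` rather than `.utils`, `simplify_prim_func` is re-exported from the new `.base_scheduler`, and the backend/transform enums from the new `.operator_common`, all at the `bitblas.base` level.

```python
# Sketch only: exercises the names re-exported from bitblas.base after this change.
from bitblas.base import (
    simplify_prim_func,            # re-exported from .base_scheduler
    fast_tune,                     # now sourced from .tuner instead of .utils
    fast_tune_with_dynamic_range,  # likewise moved to .tuner
    TransformKind,
    BackendKind,
)

print(BackendKind.TileLang.is_tilelang_backend())                # True
print(TransformKind.LDMatrixTransform.is_ld_matrix_transform())  # True
```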
59 changes: 55 additions & 4 deletions bitblas/base/arch/__init__.py
@@ -4,9 +4,13 @@
from .cuda import *
from .cpu import *
from .cdna import *
+from typing import Union


-def get_arch(target: tvm.target.Target) -> TileDevice:
+def get_arch(target: Union[str, tvm.target.Target] = "cuda") -> TileDevice:
+    if isinstance(target, str):
+        target = tvm.target.Target(target)

if target.kind.name == "cuda":
return CUDA(target)
elif target.kind.name == "llvm":
@@ -17,16 +21,63 @@ def get_arch(target: tvm.target.Target) -> TileDevice:
raise ValueError(f"Unsupported target: {target.kind.name}")


def auto_infer_current_arch() -> TileDevice:
# TODO(lei): This is a temporary solution to infer the current architecture
# Can be replaced by a more sophisticated method in the future
return get_arch("cuda")


def is_cpu_arch(arch: TileDevice) -> bool:
return isinstance(arch, CPU)


def is_cuda_arch(arch: TileDevice) -> bool:
return isinstance(arch, CUDA)


def is_ampere_arch(arch: TileDevice) -> bool:
conditions = [True]
-    conditions.append(isinstance(arch, CUDA))
-    conditions.append(arch.sm_version >= 80)
+    conditions.append(is_cuda_arch(arch))
+    conditions.append(arch.sm_version >= 80 and arch.sm_version < 90)
return all(conditions)


def is_volta_arch(arch: TileDevice) -> bool:
conditions = [True]
-    conditions.append(isinstance(arch, CUDA))
+    conditions.append(is_cuda_arch(arch))
conditions.append(arch.sm_version >= 70)
conditions.append(arch.sm_version < 80)
return all(conditions)


def is_cdna_arch(arch: TileDevice) -> bool:
return isinstance(arch, CDNA)


def has_mma_support(arch: TileDevice) -> bool:
conditions = [True]
conditions.append(is_cuda_arch(arch))
conditions.append(arch.sm_version >= 80)
return all(conditions)


def is_tensorcore_supported_precision(in_dtype: str, accum_dtype: str, arch: TileDevice) -> bool:
volta_tensorcore_supported = [
("float16", "float32"),
("float16", "float16"),
]
ampere_tensorcore_supported = [
("float16", "float32"),
("float16", "float16"),
("int8", "int32"),
("int4", "int32"),
("int2", "int32"),
("int1", "int32"),
]

if is_volta_arch(arch):
return (in_dtype, accum_dtype) in volta_tensorcore_supported
elif is_ampere_arch(arch):
return (in_dtype, accum_dtype) in ampere_tensorcore_supported
else:
raise ValueError(f"Unsupported architecture: {arch}")
148 changes: 148 additions & 0 deletions bitblas/base/base_scheduler.py
@@ -0,0 +1,148 @@
from tvm import te
from tvm import IRModule
from tvm.tir import PrimFunc
from typing import Optional, Union, Callable, List, Dict
from dataclasses import dataclass, field
from tvm.tl.transform import Simplify
from abc import ABC, abstractmethod
from bitblas.base.arch import TileDevice, is_volta_arch, is_ampere_arch, is_cdna_arch, auto_infer_current_arch
from bitblas.base.roller.hint import Hint
from bitblas.tl.base_hint import BaseTLHint


# Decorator to simplify the output of a function
def maybe_simplify(self, func: Callable) -> Callable:

def wrapper(*args, **kwargs):
stmt: Union[PrimFunc, IRModule] = (func)(*args, **kwargs)
if self._enable_simplify:
return self.Simplify(stmt)
return stmt

return wrapper


@dataclass
class BaseScheduler(ABC):

_arch: TileDevice = field(default=auto_infer_current_arch(), init=False, repr=False)

_enable_simplify: bool = field(default=True, init=False, repr=False)

_dynamic_range: Dict[str, int] = field(default_factory=dict, init=False, repr=False)

@staticmethod
def Simplify(stmt: Union[PrimFunc, IRModule]) -> Union[PrimFunc, IRModule]:
if isinstance(stmt, PrimFunc):
mod = Simplify()(IRModule.from_expr(stmt))
assert len(mod.functions) == 1, "Simplify should return a single function"
return list(mod.functions.values()).pop()
elif isinstance(stmt, IRModule):
return Simplify()(stmt)
else:
raise ValueError(f"Unsupported type: {type(stmt)}")

def get_hardware_aware_configs(self,
arch: TileDevice = None,
topk: int = 10) -> List[BaseTLHint]:
raise NotImplementedError(
f"{self.__class__.__name__} does not support hardware-aware tuning for {arch} with topk={topk}"
)

def activate_simplify(self) -> "BaseScheduler":
self._enable_simplify = True
return self

def deactivate_simplify(self) -> "BaseScheduler":
self._enable_simplify = False
return self

def maybe_simplify(self, stmt: Union[PrimFunc, IRModule]) -> Union[PrimFunc, IRModule]:
if self._enable_simplify:
return self.Simplify(stmt)
return stmt

def with_self_attrs(self, func: PrimFunc) -> PrimFunc:
if self._dynamic_range:
func = func.with_attr("opt_shapes", self._dynamic_range)
return func

def post_process(self, func: PrimFunc) -> PrimFunc:
func = self.with_self_attrs(func)
func = self.maybe_simplify(func)
return func

def set_dynamic_range(self, dynamic_range: Dict[str, int]) -> "BaseScheduler":
self._dynamic_range = dynamic_range
return self

def has_dynamic_range(self) -> bool:
return bool(self._dynamic_range)

def with_arch(self, arch: TileDevice) -> "BaseScheduler":
self._arch = arch
return self

def has_arch(self) -> bool:
return self._arch is not None

def is_volta_arch(self) -> bool:
return is_volta_arch(self._arch) if self._arch is not None else False

def is_ampere_arch(self) -> bool:
return is_ampere_arch(self._arch) if self._arch is not None else False

def is_cdna_arch(self) -> bool:
return is_cdna_arch(self._arch) if self._arch is not None else False

@staticmethod
def maybe_dynamic(arg: Union[int, List[int]], dynamic_symbol: str = "m") -> PrimFunc:
if isinstance(arg, int):
return arg
return te.var(dynamic_symbol)

@abstractmethod
def with_default_config(self, *args, **kwargs) -> PrimFunc:
pass

@abstractmethod
def apply_config(
self,
*args,
**kwargs,
) -> PrimFunc:
pass

def serialize_hints_to_configs(self, hints: List[Hint]) -> List[BaseTLHint]:
# Convert Roller Hints to TileLang Hints
raise NotImplementedError("Serialization of hints to configs is not implemented")

def specialize_from_dynamic_range(self,
dynamic_range: Optional[Dict[str,
int]] = None) -> "BaseScheduler":
raise NotImplementedError("Specialization from dynamic range is not implemented")

@property
def common_header(self) -> str:
# TODO(lei): For HIP Backend it should be different
common_header = "#include <tl_templates/cuda/common.h>\n"
return common_header

@property
def global_symbol(self):
# For kernel name generation
return "default"

@property
def arch(self) -> TileDevice:
return self._arch


# Decorator to simplify the output of a function
def simplify_prim_func(func: Callable) -> Callable:

def wrapper(*args, **kwargs):
stmt: Union[PrimFunc, IRModule] = (func)(*args, **kwargs)
return BaseScheduler.Simplify(stmt)

return wrapper
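
To illustrate the contract `BaseScheduler` defines, here is a hypothetical minimal subclass; the `CopyScheduler` name, its `M`/`N` fields, and the element-wise copy kernel are illustrative only and not part of this PR. It exercises the chained helpers (`with_arch`, `set_dynamic_range`, `deactivate_simplify`) and the `post_process` hook that attaches `opt_shapes` and optionally simplifies.

```python
from dataclasses import dataclass

from tvm import te
from tvm.tir import PrimFunc

from bitblas.base.arch import get_arch
from bitblas.base.base_scheduler import BaseScheduler


@dataclass
class CopyScheduler(BaseScheduler):
    # Illustration-only scheduler: lowers an element-wise copy so the
    # BaseScheduler plumbing (arch, dynamic range, simplify) can be shown.
    M: int = 1024
    N: int = 1024

    def with_default_config(self) -> PrimFunc:
        # A real scheduler would pick a sensible default hint here.
        return self.apply_config()

    def apply_config(self) -> PrimFunc:
        A = te.placeholder((self.M, self.N), dtype="float16", name="A")
        B = te.compute((self.M, self.N), lambda i, j: A[i, j], name="B")
        func = te.create_prim_func([A, B])
        # post_process attaches the "opt_shapes" attribute when a dynamic
        # range was set and runs Simplify unless it was deactivated.
        return self.post_process(func)


scheduler = (
    CopyScheduler(M=4096, N=4096)
    .with_arch(get_arch("cuda"))     # attach a TileDevice for arch queries
    .set_dynamic_range({"m": 4096})  # recorded as the opt_shapes attribute
    .deactivate_simplify()           # skip the tvm.tl Simplify pass
)
assert scheduler.has_arch() and scheduler.has_dynamic_range()
func = scheduler.with_default_config()
print(func.attrs["opt_shapes"])      # shows the attached dynamic range
```

Concrete schedulers are expected to emit real TileLang kernels from `apply_config` and, when they support hardware-aware tuning, override `serialize_hints_to_configs` and `get_hardware_aware_configs`, which the base class leaves unimplemented.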
102 changes: 102 additions & 0 deletions bitblas/base/operator_common.py
@@ -0,0 +1,102 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from enum import IntEnum


class OptimizeStrategy(IntEnum):
SingleBatchDecodeOnly = 0
ContigousBatching = 1

def is_single_batch_decode_only(self):
return self == OptimizeStrategy.SingleBatchDecodeOnly

def is_contigous_batching(self):
return self == OptimizeStrategy.ContigousBatching


class TransformKind(IntEnum):
NonTransform = 0
InterWarpTransform = 1
IntraWarpTransform = 2
LDMatrixTransform = 3

def is_non_transform(self):
return self == TransformKind.NonTransform

def is_inter_warp_transform(self):
return self == TransformKind.InterWarpTransform

def is_intra_warp_transform(self):
return self == TransformKind.IntraWarpTransform

def is_ld_matrix_transform(self):
return self == TransformKind.LDMatrixTransform


class BackendKind(IntEnum):
TIR = 0
TileLang = 1

def is_tir_backend(self):
return self == BackendKind.TIR

def is_tilelang_backend(self):
return self == BackendKind.TileLang


class QuantizationMemoryStage(IntEnum):
# Represents in which stage the dequantize operation is performed
#
# 1. For devices without async copy, we can use a simple dequantize schedule
# without shared memory prefetch.
# quantized weight
# |
# V
# dequantized in register
# |
# V
# save into shared memory
# |
# V
# compute
#
# 2. For A100 Like devices, the shared memory prefetch(async) is required
# to achieve optimal performance.
# quantized weight
# |
# V
# shared memory prefetch (with async copy)
# |
# V
# dequantized into shared memory
# |
# V
# compute
# 3. For A100 Like devices, the shared memory prefetch(async) is required
# to achieve optimal performance.
# quantized weight
# |
# V
# shared memory prefetch (with async copy)
# |
# V
# LDMatrix into warp memory
# |
# V
# Dequantize
# |
# V
# Compute
Local = 0
Shared = 1
Global = 2

def is_quant_memory_in_local(self):
return self == QuantizationMemoryStage.Local

def is_quant_memory_in_shared(self):
return self == QuantizationMemoryStage.Shared

def is_quant_memory_in_global(self):
return self == QuantizationMemoryStage.Global
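
A brief sketch of how the new enums are meant to be queried through their predicate helpers; the dispatch printed below is illustrative, not taken from this PR.

```python
from bitblas.base.operator_common import (
    BackendKind,
    OptimizeStrategy,
    QuantizationMemoryStage,
    TransformKind,
)

backend = BackendKind.TileLang
transform = TransformKind.LDMatrixTransform
stage = QuantizationMemoryStage.Shared
strategy = OptimizeStrategy.SingleBatchDecodeOnly

# Prefer the predicate helpers over comparing raw enum values.
if backend.is_tilelang_backend() and transform.is_ld_matrix_transform():
    print("TileLang backend with LDMatrix weight transform")
if stage.is_quant_memory_in_shared():
    print("dequantize into shared memory (async-copy prefetch pipeline)")
if strategy.is_single_batch_decode_only():
    print("tuned for single-batch decode")

# IntEnum members behave like plain ints, which keeps serialized configs
# and cache keys simple.
assert int(TransformKind.IntraWarpTransform) == 2
```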