From 0e36bd8b091fb70bb1c7405fceafa85ef6c081d7 Mon Sep 17 00:00:00 2001 From: Exusial Date: Sat, 2 Apr 2022 04:24:43 -0400 Subject: [PATCH 1/7] Add cut. --- python/jittor/compile_extern.py | 53 +++++++ python/jittor/compiler.py | 2 +- .../cuda/cutlass/ops/cutlass_matmul_op.cc | 134 ++++++++++++++++++ .../cuda/cutlass/ops/cutlass_matmul_op.h | 25 ++++ python/jittor/src/opt/tuner/matmul_tuner.cc | 2 +- 5 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.cc create mode 100644 python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.h diff --git a/python/jittor/compile_extern.py b/python/jittor/compile_extern.py index c43cdfd7..1e56c183 100644 --- a/python/jittor/compile_extern.py +++ b/python/jittor/compile_extern.py @@ -563,6 +563,58 @@ def inner(self, *args, **kw): if k == "mpi_test": continue setattr(core.Var, k, wrapper(mpi_ops.__dict__[k])) +def install_cutlass(root_folder): + url = "https://cloud.tsinghua.edu.cn/f/e3dbf620495f48458488/?dl=1" + + filename = "cutlass-master.zip" + fullname = os.path.join(root_folder, filename) + dirname = os.path.join(root_folder, "cutlass-master") + true_md5 = "95d5ca8f981a3694ec3efd658084fa3b" + + if os.path.exists(fullname): + md5 = run_cmd('md5sum '+fullname).split()[0] + if md5 != true_md5: + os.remove(fullname) + if os.path.isdir(dirname): + shutil.rmtree(dirname) + if not os.path.isfile(os.path.join(dirname, "include")): + if not os.path.isfile(os.path.join(root_folder, filename)): + LOG.i("Downloading cutlass...") + download_url_to_local(url, filename, root_folder, true_md5) + + if core.get_device_count() == 0: + return + + shutil.unpack_archive(fullname, root_folder) + return dirname + +def setup_cutlass(): + use_cutlass = os.environ.get("use_cutlass", "1")=="1" + if not has_cuda: + use_cutlass = False + return + if not use_cutlass: return + cutlass_include_path = os.environ.get("cutlass_include_path") + if cutlass_include_path is None: + LOG.v("setup cutlass...") + from pathlib import Path + cutlass_path = os.path.join(str(Path.home()), ".cache", "jittor", "cutlass") + + make_cache_dir(cutlass_path) + cutlass_home = install_cutlass(cutlass_path) + if cutlass_home is None: return + cutlass_include_path = os.path.join(cutlass_home, "include") + cutlass_tool_include_path = os.path.join(cutlass_home, "tools", "util", "include") + all_dir = f" -I\"{cutlass_include_path}\" -I\"{cutlass_tool_include_path}\"" + cutlass_src_dir = os.path.join(jittor_path, "extern", "cuda", "cutlass") + cutlass_src_files = [] + for r, _, f in os.walk(cutlass_src_dir): + for fname in f: + cutlass_src_files.append(os.path.join(r, fname)) + cutlass_ops = compile_custom_ops(cutlass_src_files, + extra_flags=f" {all_dir} ") + LOG.vv("Get cutlass_ops: "+str(dir(cutlass_ops))) + if os.environ.get("FIX_TORCH_ERROR", "0") == "1": try: import torch @@ -577,6 +629,7 @@ def inner(self, *args, **kw): setup_nccl() setup_cutt() +setup_cutlass() try: setup_mkl() diff --git a/python/jittor/compiler.py b/python/jittor/compiler.py index 54409ac5..10c045b2 100644 --- a/python/jittor/compiler.py +++ b/python/jittor/compiler.py @@ -686,7 +686,7 @@ def compile_custom_ops( bname = os.path.splitext(bname)[0] if bname.endswith("_op"): bname = bname[:-3] - if name.endswith(".cc"): + if name.endswith(".cc") or name.endswith(".cu"): srcs[bname] = name elif name.endswith(".h"): includes.append(os.path.dirname(name)) diff --git a/python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.cc 
b/python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.cc
new file mode 100644
index 00000000..61b50486
--- /dev/null
+++ b/python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.cc
@@ -0,0 +1,134 @@
+// ***************************************************************
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guowei Yang <471184555@qq.com>
+//     Dun Liang .
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+
+#ifdef JIT
+#include
+#endif
+#include "var.h"
+#include "cutlass_matmul_op.h"
+
+using namespace std;
+
+namespace jittor {
+
+extern int use_tensorcore;
+
+#ifndef JIT
+
+CutlassMatmulOp::CutlassMatmulOp(Var* a, Var* b, bool trans_a, bool trans_b)
+    : a(a), b(b), trans_a(trans_a), trans_b(trans_b) {
+    flags.set(NodeFlags::_cuda, 1);
+    flags.set(NodeFlags::_cpu, 0);
+    // TODO: support int8 * int8
+    ASSERT(a->dtype().is_float() && b->dtype().is_float()) << "type of two inputs should be float";
+    // TODO: support different input types
+    ASSERT(a->dtype().dsize() == b->dtype().dsize()) << "type of two inputs should be the same";
+    c = create_output(nullptr, a->dtype());
+}
+
+void CutlassMatmulOp::infer_shape() {
+    ASSERTop(a->shape.size(),==,2);
+    ASSERTop(b->shape.size(),==,2);
+    int n = a->shape[0], m = a->shape[1];
+    int m_ = b->shape[0], k = b->shape[1];
+    if (trans_a) {
+        swap(n, m);
+    }
+    if (trans_b) {
+        swap(m_, k);
+    }
+    ASSERTop(m,==,m_);
+    c->set_shape({n, k});
+}
+
+void CutlassMatmulOp::jit_prepare(JK& jk) {
+    jk << _CS("[T:") << a->dtype();
+    jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N');
+    jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N');
+    jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D');
+    jk << ']';
+}
+
+#else // JIT
+#pragma clang diagnostic ignored "-Wtautological-compare"
+
+
+static inline cudaError_t CutlassSgemmNN(
+    int M,
+    int N,
+    int K,
+    float alpha,
+    float const *A,
+    int lda,
+    float const *B,
+    int ldb,
+    float beta,
+    float *C,
+    int ldc) {
+
+    // Define type definition for single-precision CUTLASS GEMM with column-major
+    // input matrices and 128x128x8 threadblock tile size (chosen by default).
+    //
+    // To keep the interface manageable, several helpers are defined for plausible compositions
+    // including the following example for single-precision GEMM. Typical values are used as
+    // default template arguments. See `cutlass/gemm/device/default_gemm_configuration.h` for more details.
+    //
+    // To view the full gemm device API interface, see `cutlass/gemm/device/gemm.h`
+
+    using ColumnMajor = cutlass::layout::ColumnMajor;
+    using RowMajor = cutlass::layout::RowMajor;
+    using CutlassGemm = cutlass::gemm::device::Gemm; // Layout of C matrix
+
+    CutlassGemm gemm_operator;
+
+    CutlassGemm::Arguments args({M, N, K},      // Gemm Problem dimensions
+                                {A, lda},       // Tensor-ref for source matrix A
+                                {B, ldb},       // Tensor-ref for source matrix B
+                                {C, ldc},       // Tensor-ref for source matrix C
+                                {C, ldc},       // Tensor-ref for destination matrix D (may be different memory than source C matrix)
+                                {alpha, beta}); // Scalars used in the Epilogue
+
+    cutlass::Status status = gemm_operator(args);
+    if (status != cutlass::Status::kSuccess) {
+        return cudaErrorUnknown;
+    }
+    return cudaSuccess;
+}
+
+void CutlassMatmulOp::jit_run() {
+    const T alpha = 1.0f;
+    const T beta = 0.0f;
+    const auto& as = a->shape;
+    const auto& bs = b->shape;
+    auto n = as[0];
+    auto m = as[1];
+    auto k = bs[1];
+    if ('@Trans_a'=='T') {
+        n = as[1];
+        m = as[0];
+    }
+    if ('@Trans_b'=='T') {
+        k = bs[0];
+    }
+    using ColumnMajor = cutlass::layout::ColumnMajor;
+    using RowMajor = cutlass::layout::RowMajor;
+    // a: [n,m], b: [m,k], c: [n,k]
+    cudaError_t result = CutlassSgemmNN(n, k, m, alpha, a->ptr<T>(), '@Trans_a' == 'N' ? m : n, b->ptr<T>(), '@Trans_b' == 'N' ? k : m, beta, c->ptr<T>(), k);
+    assert(result == cudaSuccess);
+}
+#endif // JIT
+
+} // jittor
diff --git a/python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.h b/python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.h
new file mode 100644
index 00000000..b6cc0f75
--- /dev/null
+++ b/python/jittor/extern/cuda/cutlass/ops/cutlass_matmul_op.h
@@ -0,0 +1,25 @@
+// ***************************************************************
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang .
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#pragma once
+#include "op.h"
+
+namespace jittor {
+
+struct CutlassMatmulOp : Op {
+    Var* a, * b, * c;
+    bool trans_a, trans_b;
+    CutlassMatmulOp(Var* a, Var* b, bool trans_a, bool trans_b);
+
+    const char* name() const override { return "cutlass_matmul"; }
+    void infer_shape() override;
+    DECLARE_jit_run;
+};
+
+} // jittor
\ No newline at end of file
diff --git a/python/jittor/src/opt/tuner/matmul_tuner.cc b/python/jittor/src/opt/tuner/matmul_tuner.cc
index 43e47e8a..7c2d8386 100644
--- a/python/jittor/src/opt/tuner/matmul_tuner.cc
+++ b/python/jittor/src/opt/tuner/matmul_tuner.cc
@@ -87,7 +87,7 @@ void MatmulTuner::run(PassManager* pm, TunerManager* tm) {
         if (xx->dtype().dsize() != 4) continue;
         string relay_matmul_name = fop->flags.get(NodeFlags::_cpu) ?
-            "mkl_matmul" : "cublas_matmul";
+            "mkl_matmul" : "cutlass_matmul";
         if (!has_op(relay_matmul_name)) return;
         auto make_matmul = get_op_info(relay_matmul_name)

From 4a8638c09dbd53f76aea1a5d0cab39a7e6f53d10 Mon Sep 17 00:00:00 2001
From: Exusial
Date: Wed, 6 Apr 2022 22:01:20 -0400
Subject: [PATCH 2/7] add cutlass support.
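
For reviewers, a minimal usage sketch of the depthwise convolution path this
patch wires in. This is a sketch only: it assumes a CUDA build with
use_cutlass=1, and the tensor shapes below are illustrative, not taken from
the diff.

    import os
    os.environ["use_cutlass"] = "1"      # opt in; this patch defaults the flag to "0"

    import jittor as jt
    from jittor.cutlass_depthwise_conv import DepthwiseConv  # module added by this patch

    jt.flags.use_cuda = 1
    conv = DepthwiseConv(stride=1, padding=1)   # jt.Function backed by jt.code + CUTLASS
    x = jt.random((1, 32, 32, 32))              # NCHW input, C == 32
    w = jt.random((32, 1, 3, 3))                # one 3x3 filter per channel (assumed layout)
    y = conv(x, w)                              # forward: Fprop kernel
    dx, dw = jt.grad(y.sum(), [x, w])           # backward: Dgrad / Wgrad kernels via grad()

Note the generated kernels hard-code stride 1 and half-kernel padding, so the
3x3 / padding=1 configuration above is the case the CUDA source handles.
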
--- python/jittor/compile_extern.py | 11 +- python/jittor/compiler.py | 2 +- python/jittor/cutlass_depthwise_conv.py | 771 ++++++++++++++++++++ python/jittor/nn.py | 6 +- python/jittor/src/opt/tuner/matmul_tuner.cc | 2 +- 5 files changed, 784 insertions(+), 8 deletions(-) create mode 100644 python/jittor/cutlass_depthwise_conv.py diff --git a/python/jittor/compile_extern.py b/python/jittor/compile_extern.py index a252e765..11a6af20 100644 --- a/python/jittor/compile_extern.py +++ b/python/jittor/compile_extern.py @@ -561,12 +561,12 @@ def inner(self, *args, **kw): setattr(core.Var, k, wrapper(mpi_ops.__dict__[k])) def install_cutlass(root_folder): - url = "https://cloud.tsinghua.edu.cn/f/e3dbf620495f48458488/?dl=1" + url = "https://cloud.tsinghua.edu.cn/f/8fc42499904f43e39141/?dl=1" - filename = "cutlass-master.zip" + filename = "cutlass.zip" fullname = os.path.join(root_folder, filename) - dirname = os.path.join(root_folder, "cutlass-master") - true_md5 = "95d5ca8f981a3694ec3efd658084fa3b" + dirname = os.path.join(root_folder, "cutlass") + true_md5 = "41bb524a6bad4612d6017ed4b11f1d28" if os.path.exists(fullname): md5 = run_cmd('md5sum '+fullname).split()[0] @@ -586,7 +586,7 @@ def install_cutlass(root_folder): return dirname def setup_cutlass(): - use_cutlass = os.environ.get("use_cutlass", "1")=="1" + use_cutlass = os.environ.get("use_cutlass", "0")=="1" if not has_cuda: use_cutlass = False return @@ -600,6 +600,7 @@ def setup_cutlass(): make_cache_dir(cutlass_path) cutlass_home = install_cutlass(cutlass_path) if cutlass_home is None: return + os.environ['cutlass_include_path'] = cutlass_home cutlass_include_path = os.path.join(cutlass_home, "include") cutlass_tool_include_path = os.path.join(cutlass_home, "tools", "util", "include") all_dir = f" -I\"{cutlass_include_path}\" -I\"{cutlass_tool_include_path}\"" diff --git a/python/jittor/compiler.py b/python/jittor/compiler.py index 26bc3b1c..a61419ad 100644 --- a/python/jittor/compiler.py +++ b/python/jittor/compiler.py @@ -693,7 +693,7 @@ def compile_custom_ops( bname = os.path.splitext(bname)[0] if bname.endswith("_op"): bname = bname[:-3] - if name.endswith(".cc") or name.endswith(".cu"): + if name.endswith(".cc"): srcs[bname] = name elif name.endswith(".h"): includes.append(os.path.dirname(name)) diff --git a/python/jittor/cutlass_depthwise_conv.py b/python/jittor/cutlass_depthwise_conv.py new file mode 100644 index 00000000..b3a29be2 --- /dev/null +++ b/python/jittor/cutlass_depthwise_conv.py @@ -0,0 +1,771 @@ +import jittor as jt +import numpy as np +from jittor import nn +import time +import sys +import os + +cutlass_path = os.environ.get('cutlass_include_path') +def depthwise_src_backward(x, weights): + cuda_header = ''' + #undef out + #include + + #include + #include + + #include + + #include + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include "executor.h" + + #include + + + // The code section below describes datatype for input, output tensors and + // computation between elements + using ElementAccumulator = float; // Data type of accumulator + using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) + using ElementSrc = float; // Data type of elements in src tensor + using ElementFilter = float; // Data type of elements in filter tensor + using ElementDst = float; // Data type of elements in output tensor + + using LayoutSrc = cutlass::layout::TensorNCHW; + using LayoutFilter = cutlass::layout::TensorNCHW; + using LayoutDst 
= cutlass::layout::TensorNCHW; + + // This code section describes whether you want to use tensor cores or regular + // SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassSimt; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will compute + using ThreadblockShape = + cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape + + // This code section describes tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape + + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle; + + // Number of pipelines you want to use + constexpr int NumStages = 2; + + // This code section describes the epilogue part of the kernel, we use default + // value + using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + ElementDst, // Data type of output matrix. + 1, ElementAccumulator, // Data type of accumulator + ElementDst, // Data type of bias + ElementComputeEpilogue>; // Data type for alpha/beta in linear + // combination + using Convolution = cutlass::conv::device::Deconvolution< + ElementSrc, LayoutSrc, ElementFilter, LayoutFilter, ElementDst, + LayoutDst, ElementDst, LayoutDst, ElementDst, + cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, + SwizzleThreadBlock, NumStages, 1, 1, + cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, + cutlass::conv::ImplicitGemmMode::GEMM_TN>; + + struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options() + : help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 1), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(1000), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) {} + + // Verify the problem size is compatible with the CUTLASS Convolution + // implementation. 
+ bool valid() { + int const kAlignment = 1; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / + conv_stride.row() + + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / + conv_stride.column() + + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS / K + int64_t fmas = + output_size().product() * + int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / + output_size().c(); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } + + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") + #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + + ''' + + cuda_src = ''' + @alias(weights, in1) + @alias(x, in0) + @alias(dst, out0) + bool notSupported = false; + Options options = Options(); + options.update( + {x_shape0, x_shape2, x_shape3, x_shape1}, + {weights_shape0, weights_shape2, weights_shape3, 1}); + + cutlass::TensorRef d_src((ElementSrc*)x_p, + LayoutSrc().packed({options.input_size})); + cutlass::TensorRef d_filter((ElementFilter*)weights_p, + LayoutFilter().packed(options.filter_size)); + cutlass::TensorRef d_dst((ElementDst*)dst_p, LayoutDst().packed(options.output_size())); + cutlass::TensorRef d_bias = {nullptr, Convolution::LayoutDst()}; + cutlass::TensorRef d_z = {nullptr, Convolution::LayoutDst()}; + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + int split_k_slices = 1; + typename Convolution::Arguments arguments{ + {options.input_size, options.filter_size, options.padding, + options.conv_stride, options.dilation, options.output_size(), mode, + split_k_slices, options.filter_size.n()}, + d_src, // tensor_src.device_ref(), + d_filter, // tensor_filter.device_ref(), + d_bias, // tensor_bias.device_ref(), + d_z, // tensor_z.device_ref(), + d_dst, // tensor_dst.device_ref(), + {options.alpha, 0, options.beta}}; + + Convolution conv_op; + + size_t workspace_size = conv_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation 
workspace(workspace_size); + CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); + // auto temp1 = exe.alloc_temp(workspace_size); + // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); + { + //static SimpleProfiler _("aa"); + //SimpleProfilerGuard __(_); + + //ccccc c + CUTLASS_CHECK(conv_op()); + // kernel<<<1,1>>>(); + } + ''' + output = jt.zeros(x.shape) + output = jt.code([x, weights], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + +def depthwise_filter_backward(x, weights, diff): + cuda_header = ''' + #undef out + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include "executor.h" + + #include + + // The code section below describes datatype for input, output tensors and + // computation between elements + using ElementAccumulator = float; // Data type of accumulator + using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) + using ElementSrc = float; // Data type of elements in src tensor + using ElementFilter = float; // Data type of elements in filter tensor + using ElementDst = float; // Data type of elements in output tensor + + using LayoutSrc = cutlass::layout::TensorNCHW; + using LayoutFilter = cutlass::layout::TensorNCHW; + using LayoutDst = cutlass::layout::TensorNCHW; + using LayoutGrad = cutlass::layout::TensorNCHW; + + // This code section describes whether you want to use tensor cores or regular + // SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassSimt; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will compute + using ThreadblockShape = + cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape + + // This code section describes tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape + + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle; + + // Number of pipelines you want to use + constexpr int NumStages = 2; + + // This code section describes the epilogue part of the kernel, we use default + // value + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementFilter, // Data type of output matrix. 
+ 1, ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear + // combination + using Convolution = cutlass::conv::device::ConvolutionBackwardFilter< + ElementSrc, LayoutSrc, ElementDst, LayoutDst, ElementFilter, + LayoutFilter, ElementFilter, + cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, + SwizzleThreadBlock, NumStages, 1, 1, + cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, + cutlass::conv::ImplicitGemmMode::GEMM_NT>; + + struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options() + : help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 1), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(1000), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) {} + + // Verify the problem size is compatible with the CUTLASS Convolution + // implementation. + bool valid() { + int const kAlignment = 1; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / + conv_stride.row() + + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / + conv_stride.column() + + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS / K + int64_t fmas = + output_size().product() * + int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / + output_size().c(); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } + + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") + #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + ''' + + cuda_src = ''' + 
@alias(grad, in2) + @alias(weights, in1) + @alias(x, in0) + @alias(dst, out0) + bool notSupported = false; + Options options = Options(); + options.update( + {x_shape0, x_shape2, x_shape3, x_shape1}, + {weights_shape0, weights_shape2, weights_shape3, 1}); + + cutlass::TensorRef d_src((ElementSrc*)x_p, + LayoutSrc().packed({options.input_size})); + cutlass::TensorRef d_diff((ElementFilter*)grad_p, + LayoutDst().packed(options.output_size())); + cutlass::TensorRef d_filter((ElementFilter*)dst_p, LayoutFilter().packed(options.filter_size)); + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + int split_k_slices = 1; + typename Convolution::Arguments arguments{ + {options.input_size, options.filter_size, options.padding, + options.conv_stride, options.dilation, options.output_size(), mode, + split_k_slices, options.filter_size.n()}, + d_src, // tensor_src.device_ref(), + d_diff, // tensor_filter.device_ref(), + d_filter, + {options.alpha}}; + + Convolution conv_op; + + size_t workspace_size = conv_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); + // auto temp1 = exe.alloc_temp(workspace_size); + // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); + { + //static SimpleProfiler _("aa"); + //SimpleProfilerGuard __(_); + + //ccccc c + CUTLASS_CHECK(conv_op()); + // kernel<<<1,1>>>(); + } + ''' + output = jt.zeros(weights.shape) + output = jt.code([x, weights, diff], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + +class DepthwiseConv(jt.Function): + def __init__(self, stride=1, padding=0, dilation=1): + self.stride = stride if isinstance(stride, tuple) else (stride, stride) + self.padding = padding if isinstance(padding, tuple) else (padding, padding) + self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation) + self.x = None + self.weights = None + + def execute(self, x, weights): + self.x = x + self.weights = weights + cuda_header = ''' + #undef out + #include + #include + #include + #include + + #include + #include + + #include + + #include + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include "executor.h" + + // The code section below describes datatype for input, output tensors and + // computation between elements + using ElementAccumulator = float; // Data type of accumulator + using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) + using ElementSrc = float; // Data type of elements in src tensor + using ElementFilter = float; // Data type of elements in filter tensor + using ElementDst = float; // Data type of elements in output tensor + + using LayoutSrc = cutlass::layout::TensorNCHW; + using LayoutFilter = cutlass::layout::TensorNCHW; + using LayoutDst = cutlass::layout::TensorNCHW; + + // This code section describes whether you want to use tensor cores or regular + // SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassSimt; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will compute + using ThreadblockShape = + cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape + + // This code section describes 
tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape + + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle; + + // Number of pipelines you want to use + constexpr int NumStages = 1; + + // This code section describes the epilogue part of the kernel, we use default + // value + using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + ElementDst, // Data type of output matrix. + 1, ElementAccumulator, // Data type of accumulator + ElementDst, // Data type of bias + ElementComputeEpilogue>; // Data type for alpha/beta in linear + // combination + using Convolution = cutlass::conv::device::Convolution< + ElementSrc, LayoutSrc, ElementFilter, LayoutFilter, ElementDst, + LayoutDst, ElementDst, LayoutDst, ElementDst, + cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, + SwizzleThreadBlock, NumStages, 1, 1, + cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, + cutlass::conv::ImplicitGemmMode::GEMM_TN>; + + struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options() + : help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 1), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(false), + iterations(1000), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) {} + + // Verify the problem size is compatible with the CUTLASS Convolution + // implementation. 
+ bool valid() { + int const kAlignment = 1; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / + conv_stride.row() + + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / + conv_stride.column() + + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS / K + int64_t fmas = + output_size().product() * + int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / + output_size().c(); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } + + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + ''' + cuda_src = ''' + // __global__ void kernel() {} + @alias(weights, in1) + @alias(x, in0) + @alias(dst, out0) + bool notSupported = false; + Options options = Options(); + options.update( + {x_shape0, x_shape2, x_shape3, x_shape1}, + {weights_shape0, weights_shape2, weights_shape3, 1}); + + cutlass::TensorRef d_src((ElementSrc*)x_p, + LayoutSrc().packed({options.input_size})); + cutlass::TensorRef d_filter((ElementFilter*)weights_p, + LayoutFilter().packed(options.filter_size)); + cutlass::TensorRef d_dst((ElementDst*)dst_p, LayoutDst().packed(options.output_size())); + cutlass::TensorRef d_bias = {nullptr, Convolution::LayoutDst()}; + cutlass::TensorRef d_z = {nullptr, Convolution::LayoutDst()}; + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + int split_k_slices = 1; + typename Convolution::Arguments arguments{ + {options.input_size, options.filter_size, options.padding, + options.conv_stride, options.dilation, options.output_size(), mode, + split_k_slices, options.filter_size.n()}, + d_src, // tensor_src.device_ref(), + d_filter, // tensor_filter.device_ref(), + d_bias, // tensor_bias.device_ref(), + d_z, // tensor_z.device_ref(), + d_dst, // tensor_dst.device_ref(), + {options.alpha, 0, options.beta}}; + Convolution conv_op; + + size_t workspace_size = conv_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); + // auto temp1 = exe.alloc_temp(workspace_size); + // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); + { + //static 
SimpleProfiler _("aa"); + //SimpleProfilerGuard __(_); + + //ccccc c + CUTLASS_CHECK(conv_op()); + // kernel<<<1,1>>>(); + } + ''' + output = jt.zeros(x.shape) + output = jt.code([x, weights], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + + def grad(self, g): + return depthwise_src_backward(g, self.weights), depthwise_filter_backward(self.x, self.weights, g) \ No newline at end of file diff --git a/python/jittor/nn.py b/python/jittor/nn.py index 3a921b29..852d4f96 100644 --- a/python/jittor/nn.py +++ b/python/jittor/nn.py @@ -18,6 +18,7 @@ import numpy as np import collections import math +import os from collections import OrderedDict from jittor.pool import * from jittor.optim import * @@ -756,7 +757,10 @@ def group_norm(x, Softmax = jt.make_module(softmax, 2) GELU = jt.make_module(gelu) -from jittor.depthwise_conv import DepthwiseConv +if os.environ.get('use_cutlass') == '0': + from jittor.depthwise_conv import DepthwiseConv +else: + from jittor.cutlass_depthwise_conv import DepthwiseConv class Conv(Module): ''' Applies a 2D convolution over an input signal composed of several input planes. diff --git a/python/jittor/src/opt/tuner/matmul_tuner.cc b/python/jittor/src/opt/tuner/matmul_tuner.cc index 7c2d8386..43e47e8a 100644 --- a/python/jittor/src/opt/tuner/matmul_tuner.cc +++ b/python/jittor/src/opt/tuner/matmul_tuner.cc @@ -87,7 +87,7 @@ void MatmulTuner::run(PassManager* pm, TunerManager* tm) { if (xx->dtype().dsize() != 4) continue; string relay_matmul_name = fop->flags.get(NodeFlags::_cpu) ? - "mkl_matmul" : "cutlass_matmul"; + "mkl_matmul" : "cublas_matmul"; if (!has_op(relay_matmul_name)) return; auto make_matmul = get_op_info(relay_matmul_name) From aae2feb608af6e0966eef73dfb5a88adafe96398 Mon Sep 17 00:00:00 2001 From: Exusial Date: Mon, 11 Apr 2022 00:05:25 -0400 Subject: [PATCH 3/7] add cutlass fully fused MLP support. 
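
The depthwise kernels previously in cutlass_depthwise_conv.py move into the
new cutlass_ops.py here; the fused MLP part is not shown in this excerpt. A
hedged way to sanity-check the moved depthwise path is to compare it against
the grouped-convolution reference in jittor.nn (sketch only; it assumes a CUDA
build with use_cutlass enabled and that cutlass_ops keeps exporting a
DepthwiseConv Function, which this excerpt does not confirm):

    import os
    os.environ["use_cutlass"] = "1"

    import numpy as np
    import jittor as jt
    from jittor import nn
    from jittor.cutlass_ops import DepthwiseConv   # assumed export of the new module

    jt.flags.use_cuda = 1
    x = jt.random((2, 32, 16, 16))                 # NCHW
    w = jt.random((32, 1, 3, 3))                   # depthwise: one filter per channel

    y_cutlass = DepthwiseConv(stride=1, padding=1)(x, w)
    y_ref = nn.conv2d(x, w, stride=1, padding=1, groups=32)   # reference grouped conv

    np.testing.assert_allclose(y_cutlass.numpy(), y_ref.numpy(), rtol=1e-4, atol=1e-4)
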
--- python/jittor/cutlass_depthwise_conv.py | 771 ----------- python/jittor/cutlass_ops.py | 1595 +++++++++++++++++++++++ python/jittor/nn.py | 29 +- 3 files changed, 1622 insertions(+), 773 deletions(-) delete mode 100644 python/jittor/cutlass_depthwise_conv.py create mode 100644 python/jittor/cutlass_ops.py diff --git a/python/jittor/cutlass_depthwise_conv.py b/python/jittor/cutlass_depthwise_conv.py deleted file mode 100644 index b3a29be2..00000000 --- a/python/jittor/cutlass_depthwise_conv.py +++ /dev/null @@ -1,771 +0,0 @@ -import jittor as jt -import numpy as np -from jittor import nn -import time -import sys -import os - -cutlass_path = os.environ.get('cutlass_include_path') -def depthwise_src_backward(x, weights): - cuda_header = ''' - #undef out - #include - - #include - #include - - #include - - #include - #include - - #include - #include - #include - #include - #include - #include - #include - #include - #include - - #include "executor.h" - - #include - - - // The code section below describes datatype for input, output tensors and - // computation between elements - using ElementAccumulator = float; // Data type of accumulator - using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) - using ElementSrc = float; // Data type of elements in src tensor - using ElementFilter = float; // Data type of elements in filter tensor - using ElementDst = float; // Data type of elements in output tensor - - using LayoutSrc = cutlass::layout::TensorNCHW; - using LayoutFilter = cutlass::layout::TensorNCHW; - using LayoutDst = cutlass::layout::TensorNCHW; - - // This code section describes whether you want to use tensor cores or regular - // SIMT cores on GPU SM - using MMAOp = cutlass::arch::OpClassSimt; - - // This code section describes CUDA SM architecture number - using SmArch = cutlass::arch::Sm75; - - // This code section describes the tile size a thread block will compute - using ThreadblockShape = - cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape - - // This code section describes tile size a warp will compute - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape - - // This code section describes the size of MMA op - using InstructionShape = - cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape - - // This code section describes how threadblocks are scheduled on GPU - using SwizzleThreadBlock = - cutlass::conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle; - - // Number of pipelines you want to use - constexpr int NumStages = 2; - - // This code section describes the epilogue part of the kernel, we use default - // value - using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - ElementDst, // Data type of output matrix. 
- 1, ElementAccumulator, // Data type of accumulator - ElementDst, // Data type of bias - ElementComputeEpilogue>; // Data type for alpha/beta in linear - // combination - using Convolution = cutlass::conv::device::Deconvolution< - ElementSrc, LayoutSrc, ElementFilter, LayoutFilter, ElementDst, - LayoutDst, ElementDst, LayoutDst, ElementDst, - cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, - ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, - SwizzleThreadBlock, NumStages, 1, 1, - cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - struct Options { - bool help; - cutlass::Tensor4DCoord input_size; - cutlass::Tensor4DCoord filter_size; - cutlass::Tensor4DCoord padding; - cutlass::MatrixCoord conv_stride; - cutlass::MatrixCoord dilation; - bool reference_check; - bool measure_performance; - int iterations; - bool save_workspace; - ElementComputeEpilogue alpha; - ElementComputeEpilogue beta; - bool benchmark; - std::string tag; - - Options() - : help(false), - input_size(1, 32, 32, 32), - filter_size(32, 3, 3, 1), - padding(1, 1, 1, 1), - conv_stride(1, 1), - dilation(1, 1), - reference_check(false), - measure_performance(true), - iterations(1000), - save_workspace(false), - alpha(1), - beta(0), - benchmark(false) {} - - // Verify the problem size is compatible with the CUTLASS Convolution - // implementation. - bool valid() { - int const kAlignment = 1; - - if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { - // misaligned tensors - return false; - } - - // Invalid padding - if ((padding.h() != filter_size.h() / 2) || - (padding.w() != filter_size.w() / 2)) { - return false; - } - - return true; - } - - /// Updates input and filter sizes - void update(cutlass::Tensor4DCoord input_size, - cutlass::Tensor4DCoord filter_size) { - this->input_size = input_size; - this->filter_size = filter_size; - - padding.n() = filter_size.h() / 2; - padding.h() = filter_size.h() / 2; - padding.w() = filter_size.w() / 2; - padding.c() = filter_size.w() / 2; - } - - /// Computes the output tensor size (NPQK) - cutlass::Tensor4DCoord output_size() const { - return cutlass::Tensor4DCoord( - input_size.n(), - (input_size.h() + padding.n() + padding.h() - filter_size.h()) / - conv_stride.row() + - 1, - (input_size.w() + padding.w() + padding.c() - filter_size.w()) / - conv_stride.column() + - 1, - filter_size.n()); - } - - /// Compute performance in GFLOP/s - double gflops(double runtime_s) const { - // Number of multiply-adds = NPQK * CRS / K - int64_t fmas = - output_size().product() * - int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / - output_size().c(); - - // Two flops per multiply-add - return 2.0 * double(fmas) / double(1.0e9) / runtime_s; - } - - }; - - #define CUTLASS_CHECK(status) \ - { \ - cutlass::Status error = status; \ - if (error != cutlass::Status::kSuccess) { \ - std::cerr << "Got cutlass error: " \ - << cutlassGetStatusString(error) << " at: " << __LINE__ \ - << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - #define CUDA_CHECK(status) \ - { \ - cudaError_t error = status; \ - if (error != cudaSuccess) { \ - std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ - << " at line: " << __LINE__ << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") - #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") - #define CHECK_INPUT(x) CHECK_CUDA(x); 
CHECK_CONTIGUOUS(x) - - ''' - - cuda_src = ''' - @alias(weights, in1) - @alias(x, in0) - @alias(dst, out0) - bool notSupported = false; - Options options = Options(); - options.update( - {x_shape0, x_shape2, x_shape3, x_shape1}, - {weights_shape0, weights_shape2, weights_shape3, 1}); - - cutlass::TensorRef d_src((ElementSrc*)x_p, - LayoutSrc().packed({options.input_size})); - cutlass::TensorRef d_filter((ElementFilter*)weights_p, - LayoutFilter().packed(options.filter_size)); - cutlass::TensorRef d_dst((ElementDst*)dst_p, LayoutDst().packed(options.output_size())); - cutlass::TensorRef d_bias = {nullptr, Convolution::LayoutDst()}; - cutlass::TensorRef d_z = {nullptr, Convolution::LayoutDst()}; - cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; - - int split_k_slices = 1; - typename Convolution::Arguments arguments{ - {options.input_size, options.filter_size, options.padding, - options.conv_stride, options.dilation, options.output_size(), mode, - split_k_slices, options.filter_size.n()}, - d_src, // tensor_src.device_ref(), - d_filter, // tensor_filter.device_ref(), - d_bias, // tensor_bias.device_ref(), - d_z, // tensor_z.device_ref(), - d_dst, // tensor_dst.device_ref(), - {options.alpha, 0, options.beta}}; - - Convolution conv_op; - - size_t workspace_size = conv_op.get_workspace_size(arguments); - - // Allocate workspace memory - cutlass::device_memory::allocation workspace(workspace_size); - CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); - // auto temp1 = exe.alloc_temp(workspace_size); - // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); - { - //static SimpleProfiler _("aa"); - //SimpleProfilerGuard __(_); - - //ccccc c - CUTLASS_CHECK(conv_op()); - // kernel<<<1,1>>>(); - } - ''' - output = jt.zeros(x.shape) - output = jt.code([x, weights], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] - output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} - return output - -def depthwise_filter_backward(x, weights, diff): - cuda_header = ''' - #undef out - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - - #include "executor.h" - - #include - - // The code section below describes datatype for input, output tensors and - // computation between elements - using ElementAccumulator = float; // Data type of accumulator - using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) - using ElementSrc = float; // Data type of elements in src tensor - using ElementFilter = float; // Data type of elements in filter tensor - using ElementDst = float; // Data type of elements in output tensor - - using LayoutSrc = cutlass::layout::TensorNCHW; - using LayoutFilter = cutlass::layout::TensorNCHW; - using LayoutDst = cutlass::layout::TensorNCHW; - using LayoutGrad = cutlass::layout::TensorNCHW; - - // This code section describes whether you want to use tensor cores or regular - // SIMT cores on GPU SM - using MMAOp = cutlass::arch::OpClassSimt; - - // This code section describes CUDA SM architecture number - using SmArch = cutlass::arch::Sm75; - - // This code section describes the tile size a thread block will compute - using ThreadblockShape = - cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape - - // This code section describes tile size a warp will compute - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape 
- - // This code section describes the size of MMA op - using InstructionShape = - cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape - - // This code section describes how threadblocks are scheduled on GPU - using SwizzleThreadBlock = - cutlass::conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle; - - // Number of pipelines you want to use - constexpr int NumStages = 2; - - // This code section describes the epilogue part of the kernel, we use default - // value - using EpilogueOp = cutlass::epilogue::thread::LinearCombination< - ElementFilter, // Data type of output matrix. - 1, ElementAccumulator, // Data type of accumulator - ElementComputeEpilogue>; // Data type for alpha/beta in linear - // combination - using Convolution = cutlass::conv::device::ConvolutionBackwardFilter< - ElementSrc, LayoutSrc, ElementDst, LayoutDst, ElementFilter, - LayoutFilter, ElementFilter, - cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, - ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, - SwizzleThreadBlock, NumStages, 1, 1, - cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - struct Options { - bool help; - cutlass::Tensor4DCoord input_size; - cutlass::Tensor4DCoord filter_size; - cutlass::Tensor4DCoord padding; - cutlass::MatrixCoord conv_stride; - cutlass::MatrixCoord dilation; - bool reference_check; - bool measure_performance; - int iterations; - bool save_workspace; - ElementComputeEpilogue alpha; - ElementComputeEpilogue beta; - bool benchmark; - std::string tag; - - Options() - : help(false), - input_size(1, 32, 32, 32), - filter_size(32, 3, 3, 1), - padding(1, 1, 1, 1), - conv_stride(1, 1), - dilation(1, 1), - reference_check(false), - measure_performance(true), - iterations(1000), - save_workspace(false), - alpha(1), - beta(0), - benchmark(false) {} - - // Verify the problem size is compatible with the CUTLASS Convolution - // implementation. 
- bool valid() { - int const kAlignment = 1; - - if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { - // misaligned tensors - return false; - } - - // Invalid padding - if ((padding.h() != filter_size.h() / 2) || - (padding.w() != filter_size.w() / 2)) { - return false; - } - - return true; - } - - /// Updates input and filter sizes - void update(cutlass::Tensor4DCoord input_size, - cutlass::Tensor4DCoord filter_size) { - this->input_size = input_size; - this->filter_size = filter_size; - - padding.n() = filter_size.h() / 2; - padding.h() = filter_size.h() / 2; - padding.w() = filter_size.w() / 2; - padding.c() = filter_size.w() / 2; - } - - /// Computes the output tensor size (NPQK) - cutlass::Tensor4DCoord output_size() const { - return cutlass::Tensor4DCoord( - input_size.n(), - (input_size.h() + padding.n() + padding.h() - filter_size.h()) / - conv_stride.row() + - 1, - (input_size.w() + padding.w() + padding.c() - filter_size.w()) / - conv_stride.column() + - 1, - filter_size.n()); - } - - /// Compute performance in GFLOP/s - double gflops(double runtime_s) const { - // Number of multiply-adds = NPQK * CRS / K - int64_t fmas = - output_size().product() * - int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / - output_size().c(); - - // Two flops per multiply-add - return 2.0 * double(fmas) / double(1.0e9) / runtime_s; - } - - }; - - #define CUTLASS_CHECK(status) \ - { \ - cutlass::Status error = status; \ - if (error != cutlass::Status::kSuccess) { \ - std::cerr << "Got cutlass error: " \ - << cutlassGetStatusString(error) << " at: " << __LINE__ \ - << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - #define CUDA_CHECK(status) \ - { \ - cudaError_t error = status; \ - if (error != cudaSuccess) { \ - std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ - << " at line: " << __LINE__ << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") - #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") - #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - ''' - - cuda_src = ''' - @alias(grad, in2) - @alias(weights, in1) - @alias(x, in0) - @alias(dst, out0) - bool notSupported = false; - Options options = Options(); - options.update( - {x_shape0, x_shape2, x_shape3, x_shape1}, - {weights_shape0, weights_shape2, weights_shape3, 1}); - - cutlass::TensorRef d_src((ElementSrc*)x_p, - LayoutSrc().packed({options.input_size})); - cutlass::TensorRef d_diff((ElementFilter*)grad_p, - LayoutDst().packed(options.output_size())); - cutlass::TensorRef d_filter((ElementFilter*)dst_p, LayoutFilter().packed(options.filter_size)); - - cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; - - int split_k_slices = 1; - typename Convolution::Arguments arguments{ - {options.input_size, options.filter_size, options.padding, - options.conv_stride, options.dilation, options.output_size(), mode, - split_k_slices, options.filter_size.n()}, - d_src, // tensor_src.device_ref(), - d_diff, // tensor_filter.device_ref(), - d_filter, - {options.alpha}}; - - Convolution conv_op; - - size_t workspace_size = conv_op.get_workspace_size(arguments); - - // Allocate workspace memory - cutlass::device_memory::allocation workspace(workspace_size); - - CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); - // auto temp1 = exe.alloc_temp(workspace_size); - // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); - { - //static SimpleProfiler _("aa"); 
- //SimpleProfilerGuard __(_); - - //ccccc c - CUTLASS_CHECK(conv_op()); - // kernel<<<1,1>>>(); - } - ''' - output = jt.zeros(weights.shape) - output = jt.code([x, weights, diff], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] - output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} - return output - -class DepthwiseConv(jt.Function): - def __init__(self, stride=1, padding=0, dilation=1): - self.stride = stride if isinstance(stride, tuple) else (stride, stride) - self.padding = padding if isinstance(padding, tuple) else (padding, padding) - self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation) - self.x = None - self.weights = None - - def execute(self, x, weights): - self.x = x - self.weights = weights - cuda_header = ''' - #undef out - #include - #include - #include - #include - - #include - #include - - #include - - #include - #include - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include "executor.h" - - // The code section below describes datatype for input, output tensors and - // computation between elements - using ElementAccumulator = float; // Data type of accumulator - using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) - using ElementSrc = float; // Data type of elements in src tensor - using ElementFilter = float; // Data type of elements in filter tensor - using ElementDst = float; // Data type of elements in output tensor - - using LayoutSrc = cutlass::layout::TensorNCHW; - using LayoutFilter = cutlass::layout::TensorNCHW; - using LayoutDst = cutlass::layout::TensorNCHW; - - // This code section describes whether you want to use tensor cores or regular - // SIMT cores on GPU SM - using MMAOp = cutlass::arch::OpClassSimt; - - // This code section describes CUDA SM architecture number - using SmArch = cutlass::arch::Sm75; - - // This code section describes the tile size a thread block will compute - using ThreadblockShape = - cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape - - // This code section describes tile size a warp will compute - using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape - - // This code section describes the size of MMA op - using InstructionShape = - cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape - - // This code section describes how threadblocks are scheduled on GPU - using SwizzleThreadBlock = - cutlass::conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle; - - // Number of pipelines you want to use - constexpr int NumStages = 1; - - // This code section describes the epilogue part of the kernel, we use default - // value - using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< - ElementDst, // Data type of output matrix. 
- 1, ElementAccumulator, // Data type of accumulator - ElementDst, // Data type of bias - ElementComputeEpilogue>; // Data type for alpha/beta in linear - // combination - using Convolution = cutlass::conv::device::Convolution< - ElementSrc, LayoutSrc, ElementFilter, LayoutFilter, ElementDst, - LayoutDst, ElementDst, LayoutDst, ElementDst, - cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, - ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, - SwizzleThreadBlock, NumStages, 1, 1, - cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - struct Options { - bool help; - cutlass::Tensor4DCoord input_size; - cutlass::Tensor4DCoord filter_size; - cutlass::Tensor4DCoord padding; - cutlass::MatrixCoord conv_stride; - cutlass::MatrixCoord dilation; - bool reference_check; - bool measure_performance; - int iterations; - bool save_workspace; - ElementComputeEpilogue alpha; - ElementComputeEpilogue beta; - bool benchmark; - std::string tag; - - Options() - : help(false), - input_size(1, 32, 32, 32), - filter_size(32, 3, 3, 1), - padding(1, 1, 1, 1), - conv_stride(1, 1), - dilation(1, 1), - reference_check(false), - measure_performance(false), - iterations(1000), - save_workspace(false), - alpha(1), - beta(0), - benchmark(false) {} - - // Verify the problem size is compatible with the CUTLASS Convolution - // implementation. - bool valid() { - int const kAlignment = 1; - - if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { - // misaligned tensors - return false; - } - - // Invalid padding - if ((padding.h() != filter_size.h() / 2) || - (padding.w() != filter_size.w() / 2)) { - return false; - } - - return true; - } - - /// Updates input and filter sizes - void update(cutlass::Tensor4DCoord input_size, - cutlass::Tensor4DCoord filter_size) { - this->input_size = input_size; - this->filter_size = filter_size; - - padding.n() = filter_size.h() / 2; - padding.h() = filter_size.h() / 2; - padding.w() = filter_size.w() / 2; - padding.c() = filter_size.w() / 2; - } - - /// Computes the output tensor size (NPQK) - cutlass::Tensor4DCoord output_size() const { - return cutlass::Tensor4DCoord( - input_size.n(), - (input_size.h() + padding.n() + padding.h() - filter_size.h()) / - conv_stride.row() + - 1, - (input_size.w() + padding.w() + padding.c() - filter_size.w()) / - conv_stride.column() + - 1, - filter_size.n()); - } - - /// Compute performance in GFLOP/s - double gflops(double runtime_s) const { - // Number of multiply-adds = NPQK * CRS / K - int64_t fmas = - output_size().product() * - int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / - output_size().c(); - - // Two flops per multiply-add - return 2.0 * double(fmas) / double(1.0e9) / runtime_s; - } - - }; - - #define CUTLASS_CHECK(status) \ - { \ - cutlass::Status error = status; \ - if (error != cutlass::Status::kSuccess) { \ - std::cerr << "Got cutlass error: " \ - << cutlassGetStatusString(error) << " at: " << __LINE__ \ - << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - #define CUDA_CHECK(status) \ - { \ - cudaError_t error = status; \ - if (error != cudaSuccess) { \ - std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ - << " at line: " << __LINE__ << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - ''' - cuda_src = ''' - // __global__ void kernel() {} - @alias(weights, in1) - @alias(x, in0) - @alias(dst, out0) - bool notSupported = false; - Options options = Options(); - options.update( - {x_shape0, x_shape2, 
x_shape3, x_shape1}, - {weights_shape0, weights_shape2, weights_shape3, 1}); - - cutlass::TensorRef d_src((ElementSrc*)x_p, - LayoutSrc().packed({options.input_size})); - cutlass::TensorRef d_filter((ElementFilter*)weights_p, - LayoutFilter().packed(options.filter_size)); - cutlass::TensorRef d_dst((ElementDst*)dst_p, LayoutDst().packed(options.output_size())); - cutlass::TensorRef d_bias = {nullptr, Convolution::LayoutDst()}; - cutlass::TensorRef d_z = {nullptr, Convolution::LayoutDst()}; - cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; - - int split_k_slices = 1; - typename Convolution::Arguments arguments{ - {options.input_size, options.filter_size, options.padding, - options.conv_stride, options.dilation, options.output_size(), mode, - split_k_slices, options.filter_size.n()}, - d_src, // tensor_src.device_ref(), - d_filter, // tensor_filter.device_ref(), - d_bias, // tensor_bias.device_ref(), - d_z, // tensor_z.device_ref(), - d_dst, // tensor_dst.device_ref(), - {options.alpha, 0, options.beta}}; - Convolution conv_op; - - size_t workspace_size = conv_op.get_workspace_size(arguments); - - // Allocate workspace memory - cutlass::device_memory::allocation workspace(workspace_size); - - CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); - // auto temp1 = exe.alloc_temp(workspace_size); - // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); - { - //static SimpleProfiler _("aa"); - //SimpleProfilerGuard __(_); - - //ccccc c - CUTLASS_CHECK(conv_op()); - // kernel<<<1,1>>>(); - } - ''' - output = jt.zeros(x.shape) - output = jt.code([x, weights], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] - output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} - return output - - def grad(self, g): - return depthwise_src_backward(g, self.weights), depthwise_filter_backward(self.x, self.weights, g) \ No newline at end of file diff --git a/python/jittor/cutlass_ops.py b/python/jittor/cutlass_ops.py new file mode 100644 index 00000000..a1bf50da --- /dev/null +++ b/python/jittor/cutlass_ops.py @@ -0,0 +1,1595 @@ +import jittor as jt +import numpy as np +from jittor import nn +import time +import sys +import os + +cutlass_path = os.environ.get('cutlass_include_path') +def depthwise_src_backward(x, weights): + cuda_header = ''' + #undef out + #include + + #include + #include + + #include + + #include + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include "executor.h" + + #include + + + // The code section below describes datatype for input, output tensors and + // computation between elements + using ElementAccumulator = float; // Data type of accumulator + using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) + using ElementSrc = float; // Data type of elements in src tensor + using ElementFilter = float; // Data type of elements in filter tensor + using ElementDst = float; // Data type of elements in output tensor + + using LayoutSrc = cutlass::layout::TensorNCHW; + using LayoutFilter = cutlass::layout::TensorNCHW; + using LayoutDst = cutlass::layout::TensorNCHW; + + // This code section describes whether you want to use tensor cores or regular + // SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassSimt; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will 
compute + using ThreadblockShape = + cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape + + // This code section describes tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape + + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle; + + // Number of pipelines you want to use + constexpr int NumStages = 2; + + // This code section describes the epilogue part of the kernel, we use default + // value + using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + ElementDst, // Data type of output matrix. + 1, ElementAccumulator, // Data type of accumulator + ElementDst, // Data type of bias + ElementComputeEpilogue>; // Data type for alpha/beta in linear + // combination + using Convolution = cutlass::conv::device::Deconvolution< + ElementSrc, LayoutSrc, ElementFilter, LayoutFilter, ElementDst, + LayoutDst, ElementDst, LayoutDst, ElementDst, + cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, + SwizzleThreadBlock, NumStages, 1, 1, + cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, + cutlass::conv::ImplicitGemmMode::GEMM_TN>; + + struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options() + : help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 1), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(1000), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) {} + + // Verify the problem size is compatible with the CUTLASS Convolution + // implementation. 
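+        // Concretely: the channel/filter counts must meet the alignment
+        // requirement below, and the padding must equal filter_size/2 in both
+        // spatial dimensions, i.e. "same"-style padding for odd kernel sizes.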
+ bool valid() { + int const kAlignment = 1; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / + conv_stride.row() + + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / + conv_stride.column() + + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS / K + int64_t fmas = + output_size().product() * + int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / + output_size().c(); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } + + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") + #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + + ''' + + cuda_src = ''' + @alias(weights, in1) + @alias(x, in0) + @alias(dst, out0) + bool notSupported = false; + Options options = Options(); + options.update( + {x_shape0, x_shape2, x_shape3, x_shape1}, + {weights_shape0, weights_shape2, weights_shape3, 1}); + + cutlass::TensorRef d_src((ElementSrc*)x_p, + LayoutSrc().packed({options.input_size})); + cutlass::TensorRef d_filter((ElementFilter*)weights_p, + LayoutFilter().packed(options.filter_size)); + cutlass::TensorRef d_dst((ElementDst*)dst_p, LayoutDst().packed(options.output_size())); + cutlass::TensorRef d_bias = {nullptr, Convolution::LayoutDst()}; + cutlass::TensorRef d_z = {nullptr, Convolution::LayoutDst()}; + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + int split_k_slices = 1; + typename Convolution::Arguments arguments{ + {options.input_size, options.filter_size, options.padding, + options.conv_stride, options.dilation, options.output_size(), mode, + split_k_slices, options.filter_size.n()}, + d_src, // tensor_src.device_ref(), + d_filter, // tensor_filter.device_ref(), + d_bias, // tensor_bias.device_ref(), + d_z, // tensor_z.device_ref(), + d_dst, // tensor_dst.device_ref(), + {options.alpha, 0, options.beta}}; + + Convolution conv_op; + + size_t workspace_size = conv_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation 
workspace(workspace_size); + CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); + // auto temp1 = exe.alloc_temp(workspace_size); + // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); + { + //static SimpleProfiler _("aa"); + //SimpleProfilerGuard __(_); + + //ccccc c + CUTLASS_CHECK(conv_op()); + // kernel<<<1,1>>>(); + } + ''' + output = jt.zeros(x.shape) + output = jt.code([x, weights], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + +def depthwise_filter_backward(x, weights, diff): + cuda_header = ''' + #undef out + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include "executor.h" + + #include + + // The code section below describes datatype for input, output tensors and + // computation between elements + using ElementAccumulator = float; // Data type of accumulator + using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) + using ElementSrc = float; // Data type of elements in src tensor + using ElementFilter = float; // Data type of elements in filter tensor + using ElementDst = float; // Data type of elements in output tensor + + using LayoutSrc = cutlass::layout::TensorNCHW; + using LayoutFilter = cutlass::layout::TensorNCHW; + using LayoutDst = cutlass::layout::TensorNCHW; + using LayoutGrad = cutlass::layout::TensorNCHW; + + // This code section describes whether you want to use tensor cores or regular + // SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassSimt; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will compute + using ThreadblockShape = + cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape + + // This code section describes tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape + + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle; + + // Number of pipelines you want to use + constexpr int NumStages = 2; + + // This code section describes the epilogue part of the kernel, we use default + // value + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementFilter, // Data type of output matrix. 
+ 1, ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear + // combination + using Convolution = cutlass::conv::device::ConvolutionBackwardFilter< + ElementSrc, LayoutSrc, ElementDst, LayoutDst, ElementFilter, + LayoutFilter, ElementFilter, + cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, + SwizzleThreadBlock, NumStages, 1, 1, + cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, + cutlass::conv::ImplicitGemmMode::GEMM_NT>; + + struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options() + : help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 1), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(1000), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) {} + + // Verify the problem size is compatible with the CUTLASS Convolution + // implementation. + bool valid() { + int const kAlignment = 1; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / + conv_stride.row() + + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / + conv_stride.column() + + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS / K + int64_t fmas = + output_size().product() * + int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / + output_size().c(); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } + + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") + #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + ''' + + cuda_src = ''' + 
@alias(grad, in2) + @alias(weights, in1) + @alias(x, in0) + @alias(dst, out0) + bool notSupported = false; + Options options = Options(); + options.update( + {x_shape0, x_shape2, x_shape3, x_shape1}, + {weights_shape0, weights_shape2, weights_shape3, 1}); + + cutlass::TensorRef d_src((ElementSrc*)x_p, + LayoutSrc().packed({options.input_size})); + cutlass::TensorRef d_diff((ElementFilter*)grad_p, + LayoutDst().packed(options.output_size())); + cutlass::TensorRef d_filter((ElementFilter*)dst_p, LayoutFilter().packed(options.filter_size)); + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + int split_k_slices = 1; + typename Convolution::Arguments arguments{ + {options.input_size, options.filter_size, options.padding, + options.conv_stride, options.dilation, options.output_size(), mode, + split_k_slices, options.filter_size.n()}, + d_src, // tensor_src.device_ref(), + d_diff, // tensor_filter.device_ref(), + d_filter, + {options.alpha}}; + + Convolution conv_op; + + size_t workspace_size = conv_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); + // auto temp1 = exe.alloc_temp(workspace_size); + // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); + { + //static SimpleProfiler _("aa"); + //SimpleProfilerGuard __(_); + + //ccccc c + CUTLASS_CHECK(conv_op()); + // kernel<<<1,1>>>(); + } + ''' + output = jt.zeros(weights.shape) + output = jt.code([x, weights, diff], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + +class DepthwiseConv(jt.Function): + def __init__(self, stride=1, padding=0, dilation=1): + self.stride = stride if isinstance(stride, tuple) else (stride, stride) + self.padding = padding if isinstance(padding, tuple) else (padding, padding) + self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation) + self.x = None + self.weights = None + + def execute(self, x, weights): + self.x = x + self.weights = weights + cuda_header = ''' + #undef out + #include + #include + #include + #include + + #include + #include + + #include + + #include + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include "executor.h" + + // The code section below describes datatype for input, output tensors and + // computation between elements + using ElementAccumulator = float; // Data type of accumulator + using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) + using ElementSrc = float; // Data type of elements in src tensor + using ElementFilter = float; // Data type of elements in filter tensor + using ElementDst = float; // Data type of elements in output tensor + + using LayoutSrc = cutlass::layout::TensorNCHW; + using LayoutFilter = cutlass::layout::TensorNCHW; + using LayoutDst = cutlass::layout::TensorNCHW; + + // This code section describes whether you want to use tensor cores or regular + // SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassSimt; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will compute + using ThreadblockShape = + cutlass::gemm::GemmShape<32, 32, 8>; // Threadblock tile shape + + // This code section describes 
tile size a warp will compute + using WarpShape = cutlass::gemm::GemmShape<8, 16, 8>; // Warp tile shape + + // This code section describes the size of MMA op + using InstructionShape = + cutlass::gemm::GemmShape<1, 1, 1>; // TensorCore instruction shape + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = + cutlass::conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle; + + // Number of pipelines you want to use + constexpr int NumStages = 1; + + // This code section describes the epilogue part of the kernel, we use default + // value + using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + ElementDst, // Data type of output matrix. + 1, ElementAccumulator, // Data type of accumulator + ElementDst, // Data type of bias + ElementComputeEpilogue>; // Data type for alpha/beta in linear + // combination + using Convolution = cutlass::conv::device::Convolution< + ElementSrc, LayoutSrc, ElementFilter, LayoutFilter, ElementDst, + LayoutDst, ElementDst, LayoutDst, ElementDst, + cutlass::conv::ConvType::kDepthwiseConvolution, MMAOp, SmArch, + ThreadblockShape, WarpShape, InstructionShape, EpilogueOp, + SwizzleThreadBlock, NumStages, 1, 1, + cutlass::conv::SpecialOptimizeDesc::NONE, cutlass::arch::OpMultiplyAdd, + cutlass::conv::ImplicitGemmMode::GEMM_TN>; + + struct Options { + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options() + : help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 1), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(false), + iterations(1000), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) {} + + // Verify the problem size is compatible with the CUTLASS Convolution + // implementation. 
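+        // With the defaults above (1x32x32x32 input, 32 3x3x1 filters, padding 1,
+        // stride 1) these checks pass and output_size() below evaluates to
+        // (1, 32, 32, 32): each spatial dimension is (32 + 1 + 1 - 3) / 1 + 1 = 32.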
+ bool valid() { + int const kAlignment = 1; + + if ((input_size.c() % kAlignment) || (filter_size.n() % kAlignment)) { + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update(cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / + conv_stride.row() + + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / + conv_stride.column() + + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + // Number of multiply-adds = NPQK * CRS / K + int64_t fmas = + output_size().product() * + int64_t(filter_size.h() * filter_size.w() * filter_size.c()) / + output_size().c(); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } + + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + ''' + cuda_src = ''' + // __global__ void kernel() {} + @alias(weights, in1) + @alias(x, in0) + @alias(dst, out0) + bool notSupported = false; + Options options = Options(); + options.update( + {x_shape0, x_shape2, x_shape3, x_shape1}, + {weights_shape0, weights_shape2, weights_shape3, 1}); + + cutlass::TensorRef d_src((ElementSrc*)x_p, + LayoutSrc().packed({options.input_size})); + cutlass::TensorRef d_filter((ElementFilter*)weights_p, + LayoutFilter().packed(options.filter_size)); + cutlass::TensorRef d_dst((ElementDst*)dst_p, LayoutDst().packed(options.output_size())); + cutlass::TensorRef d_bias = {nullptr, Convolution::LayoutDst()}; + cutlass::TensorRef d_z = {nullptr, Convolution::LayoutDst()}; + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + int split_k_slices = 1; + typename Convolution::Arguments arguments{ + {options.input_size, options.filter_size, options.padding, + options.conv_stride, options.dilation, options.output_size(), mode, + split_k_slices, options.filter_size.n()}, + d_src, // tensor_src.device_ref(), + d_filter, // tensor_filter.device_ref(), + d_bias, // tensor_bias.device_ref(), + d_z, // tensor_z.device_ref(), + d_dst, // tensor_dst.device_ref(), + {options.alpha, 0, options.beta}}; + Convolution conv_op; + + size_t workspace_size = conv_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + CUTLASS_CHECK(conv_op.initialize(arguments, workspace.get())); + // auto temp1 = exe.alloc_temp(workspace_size); + // CUTLASS_CHECK(conv_op.initialize(arguments, temp1.ptr)); + { + //static 
SimpleProfiler _("aa"); + //SimpleProfilerGuard __(_); + + //ccccc c + CUTLASS_CHECK(conv_op()); + // kernel<<<1,1>>>(); + } + ''' + output = jt.zeros(x.shape) + output = jt.code([x, weights], [output], cuda_header=cuda_header, cuda_src=cuda_src)[0] + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + + def grad(self, g): + return depthwise_src_backward(g, self.weights), depthwise_filter_backward(self.x, self.weights, g) + +def backward_header(): + return ''' + #pragma once + #undef out + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + // implement temp GPUMatrix and GPUDynamicMatrix here. + // #define RM MatrixLayout::kRowMajor + // #define CM MatrixLayout::kColumnMajor + // enum class MatrixLayout { + // kColumnMajor, + // kRowMajor + // }; + + enum class Activation { + ReLU, + Exponential, + Sine, + Sigmoid, + Squareplus, + Softplus, + None, + }; + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + using SmArch = cutlass::arch::Sm80; + + using TypeAccumulator = float; + using TypeCompute = float; + using ElementComputeEpilogue = TypeAccumulator; // <- data type of epilogue operations + using MMAOp = cutlass::arch::OpClassTensorOp; + + // using ShapeMMAOp = typename std::conditional< + // std::is_same, cutlass::arch::OpClassTensorOp>::value, + // typename std::conditional< + // std::is_same::value || std::is_same::value, + // cutlass::gemm::GemmShape<16, 8, 8>, + // cutlass::gemm::GemmShape<8, 8, 4> + // >::type, + // cutlass::gemm::GemmShape<1, 1, 1> + // >::type; + + // using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; + template + struct LayerConfig { + using k_thread_block = thread_block; + using k_warp = warp; + }; + + using FullLayerK = LayerConfig, cutlass::gemm::GemmShape<32, 32, 32>>; + using LastLayerK = LayerConfig, cutlass::gemm::GemmShape<32, 32, 32>>; + + // using FullLayer = typename std::conditional< + // std::is_same, cutlass::arch::OpClassSimt>::value, + // LayerConfig, cutlass::gemm::GemmShape<32, 64, 8>>, + // LayerConfig, cutlass::gemm::GemmShape<64, 64, 32>> + // >::type; + + // using FullLayerPreReLU = typename std::conditional< + // std::is_same, cutlass::arch::OpClassSimt>::value, + // LayerConfig, cutlass::gemm::GemmShape<32, 64, 8, true>>, + // LayerConfig, cutlass::gemm::GemmShape<64, 64, 32, true>> + // >::type; + + // using LastLayer = typename std::conditional< + // std::is_same, cutlass::arch::OpClassSimt>::value, + // LayerConfig, cutlass::gemm::GemmShape<32, 64, 8>>, + // typename std::conditional< + // std::is_same::value || std::is_same::value, + // LayerConfig, cutlass::gemm::GemmShape<32, 32, 32>>, + // LayerConfig, cutlass::gemm::GemmShape<32, 32, 32>> + // >::type + // >::type; + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + + // warp activation defined here + 
template + __host__ __device__ void warp_activation(Activation activation, const fragment_t& frag, fragment_t& result) { + switch (activation) { + case Activation::ReLU: + CUTLASS_PRAGMA_UNROLL + for (int t=0; t < result.num_elements; t++) { + result.x[t] = frag.x[t] * (T)((T)frag.x[t] > (T)0.0f); + } + return; + case Activation::None: result = frag; return; + default: + // Unsupported activation + // assert(false); // Commented out due to isolated strange side-effects on Windows + return; + } + } + + template + __host__ __device__ fragment_t warp_activation(Activation activation, const fragment_t& frag) { + fragment_t result; + warp_activation(activation, frag, result); + return result; + } + + + template + __host__ __device__ void warp_activation_backward(Activation activation, const fragment_t& frag, const forward_fragment_t& forward_frag, fragment_t& result) { + switch (activation) { + case Activation::ReLU: + CUTLASS_PRAGMA_UNROLL + for (int t=0; t < result.num_elements; t++) { + result.x[t] = frag.x[t] * (T)(forward_frag.x[t] > (T)0.0f); + } + return; + case Activation::None: result = frag; return; + default: + // Unsupported activation + // assert(false); // Commented out due to isolated strange side-effects on Windows + return; + } + } + + template + __host__ __device__ fragment_t warp_activation_backward(Activation activation, const fragment_t& frag, const forward_fragment_t& forward_frag) { + fragment_t result; + warp_activation_backward(activation, frag, forward_frag, result); + return result; + } + + // // This code section describes the epilogue part of the kernel + + template + struct CutlassFragmentWrapper { + static const uint32_t num_elements = V::kElements; + V x; + }; + + template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + cutlass::FloatRoundStyle Round = cutlass::FloatRoundStyle::round_to_nearest + > + class ActivationEpilogue { + public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = cutlass::Array; + using FragmentAccumulator = cutlass::Array; + using ComputeFragment = cutlass::Array; + + static cutlass::FloatRoundStyle const kRound = Round; + + struct Params { + Activation activation; + bool sum_source; + }; + + public: + CUTLASS_HOST_DEVICE + ActivationEpilogue(Params const ¶ms) : m_activation{params.activation}, m_sum_source{params.sum_source} { } + + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return m_sum_source; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { } + + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator) const { + cutlass::NumericArrayConverter accumulator_converter; + + auto intermediate = CutlassFragmentWrapper{accumulator_converter(accumulator)}; + intermediate = warp_activation(m_activation, intermediate); + + cutlass::NumericArrayConverter destination_converter; + return destination_converter(intermediate.x); + } + + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator, FragmentOutput const &source) const { + cutlass::NumericArrayConverter 
source_converter; + cutlass::NumericArrayConverter accumulator_converter; + + cutlass::plus plus_op; + auto intermediate = CutlassFragmentWrapper{accumulator_converter(accumulator)}; + if (m_sum_source) { + intermediate.x = plus_op(intermediate.x, source_converter(source)); + } + intermediate = warp_activation(m_activation, intermediate); + + cutlass::NumericArrayConverter destination_converter; + return destination_converter(intermediate.x); + } + + private: + Activation m_activation; + bool m_sum_source; + }; + + template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + cutlass::FloatRoundStyle Round = cutlass::FloatRoundStyle::round_to_nearest + > + class ActivationTransferEpilogue { + public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = cutlass::Array; + using FragmentAccumulator = cutlass::Array; + using ComputeFragment = cutlass::Array; + + static cutlass::FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + Activation activation; + }; + + public: + /// Constructs the function object, possibly loading from pointers in host memory + CUTLASS_HOST_DEVICE + ActivationTransferEpilogue(Params const ¶ms) : m_activation{params.activation} { } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return true; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) { } + + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator, + FragmentOutput const &source) const { + + cutlass::NumericArrayConverter source_converter; + cutlass::NumericArrayConverter accumulator_converter; + + auto converted_source = CutlassFragmentWrapper{source_converter(source)}; + auto intermediate = CutlassFragmentWrapper{accumulator_converter(accumulator)}; + + intermediate = warp_activation_backward(m_activation, intermediate, converted_source); + + cutlass::NumericArrayConverter destination_converter; + return destination_converter(intermediate.x); + } + + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + cutlass::NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + cutlass::NumericArrayConverter destination_converter; + + return destination_converter(converted_accumulator); + } + + private: + Activation m_activation; + }; + + + // template + // static constexpr int n_vectorized_elements = std::is_same, cutlass::arch::OpClassTensorOp>::value ? 
(128 / cutlass::sizeof_bits::value) : 1; + + // template + // using SumOp = cutlass::epilogue::thread::LinearCombination, TypeAccumulator, TypeCompute>; + + // template + // using IntermediateActivationOp = ActivationEpilogue; + + // template + // using IntermediateActivationTransferOp = ActivationTransferEpilogue; + + // template + // using ActivationOp = ActivationEpilogue, TypeAccumulator, TypeCompute>; + + // template + // using ActivationTransferOp = ActivationTransferEpilogue, TypeAccumulator, TypeCompute>; + + using OurGemm = cutlass::gemm::device::Gemm< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + TypeAccumulator, + MMAOp, + SmArch, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + ActivationEpilogue, + SwizzleThreadBlock, + 2 + >; + + using OurGemmTransfer = cutlass::gemm::device::Gemm< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + TypeAccumulator, + MMAOp, + SmArch, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + ActivationTransferEpilogue, + SwizzleThreadBlock, + 2 + >; + + // using epi = cutlass::epilogue::thread::LinearCombination; + // using SplitKGemm = cutlass::gemm::device::GemmSplitKParallel< + // float, + // cutlass::layout::ColumnMajor, + // float, + // cutlass::layout::RowMajor, + // float, + // cutlass::layout::RowMajor, + // TypeAccumulator, + // MMAOp, + // SmArch, + // cutlass::gemm::GemmShape<128, 128, 32>, + // cutlass::gemm::GemmShape<64, 32, 32>, + // cutlass::gemm::GemmShape<16, 8, 8>, + // epi + // >; + + using OurGemmW = cutlass::gemm::device::Gemm< + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + TypeAccumulator, + MMAOp, + SmArch, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + ActivationEpilogue, + SwizzleThreadBlock, + 2 + >; + + void backward(float* input, float* grad, float* output, float* output_grad, int input_m, int input_n, int grad_m, int grad_n, int output_m, int output_n) { // grad * weight.T + using Gemm = OurGemmTransfer; + const int lda = grad_n; + const int ldb = input_m; + const int ldc = output_n; + const int ldd = output_n; + typename Gemm::Arguments arguments{ + {grad_m, input_n, grad_n}, // TODO + {grad, lda}, + {input, ldb}, + {output, ldc}, + {output_grad, ldc}, + {Activation::ReLU}, + 1 + }; + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Initialize CUTLASS kernel with arguments and workspace pointer + cutlass::device_memory::allocation workspace(workspace_size); + cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(0); + CUTLASS_CHECK(status); + } + + void last_inp_backward(float* input, float* grad, float* output, int input_m, int input_n, int grad_m, int grad_n, int output_m, int output_n) { // output * weight.T + using Gemm = OurGemm; + const int lda = grad_n; + const int ldb = input_m; + const int ldc = output_n; + const int ldd = output_n; + typename Gemm::Arguments arguments{ + {grad_m, input_n, grad_n}, // TODO + {grad, lda}, + {input, ldb}, + {output, ldc}, + {output, ldc}, + {Activation::None, 
false}, + 1 + }; + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Initialize CUTLASS kernel with arguments and workspace pointer + cutlass::device_memory::allocation workspace(workspace_size); + cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + } + + void weight_backward(float* input, float* grad, float* weight_grad, int input_m, int input_n, int grad_m, int grad_n, int weight_grad_m, int weight_grad_n) { // A.T * GRAD + + int batch_size = grad_n; + + using Gemm = OurGemmW; + const int lda = input_m; + const int ldb = grad_n; + const int ldc = weight_grad_n; + const int ldd = weight_grad_n; + typename Gemm::Arguments arguments{ + {input_n, grad_n, grad_m}, // TODO + {input, lda}, + {grad, ldb}, + {weight_grad, ldc}, + {weight_grad, ldc}, + {Activation::None, false}, + 1 + }; + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Initialize CUTLASS kernel with arguments and workspace pointer + cutlass::device_memory::allocation workspace(workspace_size); + cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + } + ''' + +class FullyFusedMlp(jt.Function): + def __init__(self): + self.input = None + self.outputs = [] + self.shapes = [] + self.dtypes = [] + self.weights_grad = [] + self.max_dim = 0 + self.weights = None + + def single_forward(self, a, b): + cuda_src = ''' + @alias(b, in1) + @alias(a, in0) + @alias(c, out0) + const int length_m = a_shape0; + const int length_n = b_shape1; + const int length_k = a_shape1; + + // Create a tuple of problem size for matrix multiplication + cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); + + // Initialize tensors using CUTLASS helper functions + cutlass::TensorRef tensor_a((ElementInputA*)a_p, + LayoutInputA().packed(problem_size.mk())); + cutlass::TensorRef tensor_b((ElementInputB*)b_p, + LayoutInputB().packed(problem_size.kn())); + cutlass::TensorRef tensor_d((ElementOutput*)c_p, + LayoutOutput().packed(problem_size.mn())); + + // Initialize alpha for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename Gemm::Arguments arguments{ + problem_size, // <- problem size of matrix multiplication + tensor_a, // <- reference to matrix A on device + tensor_b, // <- reference to matrix B on device + + {NULL, 0}, // <- the C matrix is treated as the bias vector. We can enable the GEMM + // to project away the N dimension by setting the stride to zero. 
+ + tensor_d, // <- reference to matrix D on device, + {alpha}, // <- alpha + split_k_slices}; // <- k-dimension split factor + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + + // Allocate workspace memory + auto temp1 = exe.alloc_temp(workspace_size); + + // Allocate workspace memory + // cutlass::device_memory::allocation workspace(workspace_size); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = gemm_op.initialize(arguments, temp1.ptr); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + ''' + output = jt.code((a.shape[0], b.shape[1]), a.dtype, [a, b], cuda_header=cuda_header, cuda_src=cuda_src) + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return output + + def execute(self, a, *args): + self.shapes = [] + self.dtypes = [] + self.max_dim = 0 + self.weights = list(args) + weights = args + cuda_header = ''' + #undef out + #include + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include "executor.h" + + #define CUTLASS_CHECK(status) \ + { \ + cutlass::Status error = status; \ + if (error != cutlass::Status::kSuccess) { \ + std::cerr << "Got cutlass error: " \ + << cutlassGetStatusString(error) << " at: " << __LINE__ \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + #define CUDA_CHECK(status) \ + { \ + cudaError_t error = status; \ + if (error != cudaSuccess) { \ + std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ + << " at line: " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations + using ElementInputA = float; // <- data type of elements in input matrix A + using ElementInputB = float; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + // The code section below describes matrix layout of input and output matrices. + // Column Major for Matrix A, B and C. + // + // Note this example only works for ColumnMajor output because + // 1) we only have row major epilogue. + // 2) we swap A and B if the output is column major then we can still use the + // row major epilogue. + // 3) Mx1 bias vector becomes 1xM after the swapping/transposing. + // 4) we can use the existing OutputIterator to load 1xM bias vector. 
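+    // In the configuration below all three operands use RowMajor layouts with
+    // float32 elements; the C operand is supplied as a null reference with zero
+    // stride in the kernel arguments, and LinearCombinationRelu applies the
+    // ReLU activation inside each GEMM's epilogue.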
+ + using LayoutInputA = cutlass::layout::RowMajor; + using LayoutInputB = cutlass::layout::RowMajor; + using LayoutOutput = cutlass::layout::RowMajor; + + // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM + using MMAOp = cutlass::arch::OpClassTensorOp; + + // This code section describes CUDA SM architecture number + using SmArch = cutlass::arch::Sm75; + + // This code section describes the tile size a thread block will compute + using ShapeMMAThreadBlock = + cutlass::gemm::GemmShape<128, 128, 32>; // <- threadblock tile M = 128, N = 128, K = 32 + // This code section describes tile size a warp will compute + using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32 + // This code section describes the size of MMA op + using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 8, N = 8, K = 4 + + // This code section describes how threadblocks are scheduled on GPU + using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? + + // Define the epilogue operation as LinearCombinationRelu. This is approximately equal to + // + // d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij ) + // + using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per + // vectorized memory access. For half + // precision, it's 8 elements. This becomes + // the vector width of math instructions in + // epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue, // <- data type for alpha in linear combination function + cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // <- alpha x C + bias + + // Number of pipelines you want to use + constexpr int NumStages = 2; + + using Gemm = cutlass::gemm::device::Gemm; + ''' + cuda_src = ''' + int length_m; + int length_n; + int length_k; + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + int split_k_slices = 1; + cutlass::Status status; + float* a_p; + ''' + for i in range(len(weights)): + if i == 0: + assert a.shape[1] == weights[0].shape[0] + else: + assert weights[i-1].shape[1] == weights[i].shape[0] + mm = np.maximum(weights[i].shape[0], weights[i].shape[1]) + if self.max_dim < mm: self.max_dim = mm + self.shapes.append((a.shape[0], weights[i].shape[1])) + self.dtypes.append(a.dtype) + cuda_src += f''' + if ({i} == 0) {{ + length_m = in0_shape0; + length_n = in1_shape1; + length_k = in0_shape1; + }} else {{ + length_m = in0_shape0; + length_n = in{i}_shape1; + length_k = in{0 if i == 0 else i-1}_shape1; + }} + + if ({i} == 0) a_p = in0_p; + else + a_p = out{0 if i == 0 else i-1}_p; + // Create a tuple of problem size for matrix multiplication + cutlass::gemm::GemmCoord problem_size{i}(length_m, length_n, length_k); + + // Initialize tensors using CUTLASS helper functions + cutlass::TensorRef tensor_a{i}((ElementInputA*)a_p, + LayoutInputA().packed(problem_size{i}.mk())); + cutlass::TensorRef tensor_b{i}((ElementInputB*)in{i+1}_p, + LayoutInputB().packed(problem_size{i}.kn())); + cutlass::TensorRef tensor_d{i}((ElementOutput*)out{i}_p, + LayoutOutput().packed(problem_size{i}.mn())); + + // instantiated CUTLASS kernel + typename Gemm::Arguments arguments{i}{{ + problem_size{i}, // <- problem size of matrix multiplication + tensor_a{i}, // <- reference to matrix A on device + tensor_b{i}, // <- reference to matrix B on device + + {{NULL, 0}}, // <- 
the C matrix is treated as the bias vector. We can enable the GEMM + // to project away the N dimension by setting the stride to zero. + + tensor_d{i}, // <- reference to matrix D on device, + {{alpha}}, // <- alpha + split_k_slices}}; // <- k-dimension split factor + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size{i} = Gemm::get_workspace_size(arguments{i}); + + + // Allocate workspace memory + auto temp{i} = exe.alloc_temp(workspace_size{i}); + + // Allocate workspace memory + // cutlass::device_memory::allocation workspace{i}(workspace_size); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op{i}; + + // Check the problem size is supported or not + status = gemm_op{i}.can_implement(arguments{i}); + CUTLASS_CHECK(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = gemm_op{i}.initialize(arguments{i}, temp{i}.ptr); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op{i}(); + CUTLASS_CHECK(status); + ''' + self.input = a + self.outputs = jt.code(self.shapes, self.dtypes, [a] + self.weights, cuda_header=cuda_header, cuda_src=cuda_src) + self.outputs[0].compile_options = {f"FLAGS: --expt-relaxed-constexpr --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return self.outputs[-1] + + def backward(self, grad, weight, output): + cuda_header = backward_header() + cuda_src = ''' + @alias(input, in0) + @alias(grad, in1) + @alias(weight, in2) + @alias(weight_grad, out0) + @alias(inp_grad, out1) + weight_backward(input_p, grad_p, weight_grad_p, input_shape0, input_shape1, grad_shape0, grad_shape1, weight_shape0, weight_shape1); + backward(weight_p, grad_p, input_p, inp_grad_p, weight_shape0, weight_shape1, grad_shape0, grad_shape1, input_shape0, input_shape1); + ''' + weight_grad, out_grad = jt.code([weight.shape, output.shape], [weight.dtype, output.dtype], [output, grad, weight], cuda_header=cuda_header, cuda_src=cuda_src) + weight_grad.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return out_grad, weight_grad + + def last_backward(self, grad, weight, output): + cuda_header = backward_header() + cuda_src = ''' + @alias(input, in0) + @alias(grad, in1) + @alias(weight, in2) + @alias(weight_grad, out0) + @alias(out_grad, out1) + weight_backward(input_p, grad_p, weight_grad_p, input_shape0, input_shape1, grad_shape0, grad_shape1, weight_shape0, weight_shape1); + last_inp_backward(weight_p, grad_p, out_grad_p, weight_shape0, weight_shape1, grad_shape0, grad_shape1, input_shape0, input_shape1); + ''' + weight_grad, out_grad = jt.code([weight.shape, output.shape], [weight.dtype, output.dtype], [output, grad, weight], cuda_header=cuda_header, cuda_src=cuda_src) + weight_grad.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + return out_grad, weight_grad + + def grad(self, grads): + self.weights_grad = [] + output = self.outputs[-1] + grads[output == 0] = 0 + num_hidden = len(self.weights)-1 + for idx in range(num_hidden, -1, -1): + if idx == 0: + grads, weight_grad = self.last_backward(grads, self.weights[0], self.input) + self.weights_grad.insert(0, weight_grad) + else: + grads, weight_grad = self.backward(grads, self.weights[idx], self.outputs[idx - 1]) + self.weights_grad.insert(0, weight_grad) + return (grads, *self.weights_grad) \ No newline at end of 
file diff --git a/python/jittor/nn.py b/python/jittor/nn.py index 852d4f96..b245748c 100644 --- a/python/jittor/nn.py +++ b/python/jittor/nn.py @@ -760,7 +760,7 @@ def group_norm(x, if os.environ.get('use_cutlass') == '0': from jittor.depthwise_conv import DepthwiseConv else: - from jittor.cutlass_depthwise_conv import DepthwiseConv + from jittor.cutlass_ops import DepthwiseConv, FullyFusedMlp class Conv(Module): ''' Applies a 2D convolution over an input signal composed of several input planes. @@ -2802,4 +2802,29 @@ def _fft2(x, inverse=False): y = jt.compile_extern.cufft_ops.cufft_fft(x, inverse) if inverse: y /= x.shape[1] * x.shape[2] - return y \ No newline at end of file + return y + + +class FullyFusedMLP(Module): + ''' fusing multiple linear layers in one FullyFusedMLP layer using ReLU activation. + Example:: + + m = nn.FullyFusedMLP((20, 30, 40)) # with 2 layers + input1 = jt.randn(128, 20) + output = m(input1) + print(output.shape) + # [128, 40] + ''' + def __init__(self, weights_width=None, weights=None): + assert os.environ.get("use_cutlass") == '1', "Need cutlass support!" + if weights != None: + self.weights = weights + else: + assert weights_width != None, "All inputs are None." + for idx in range(len(weights_width)-1): + self.weights.append(jt.randn(weights_width[idx], weights_width[idx+1])) + + self.ops = FullyFusedMlp() + + def execute(self, a): + return self.ops(a, *self.weights) \ No newline at end of file From b3997786696e64349b46466d61b456b506354859 Mon Sep 17 00:00:00 2001 From: Exusial Date: Tue, 3 May 2022 04:36:07 -0400 Subject: [PATCH 4/7] fix FMLP bugs. --- python/jittor/compile_extern.py | 1 - python/jittor/cutlass_ops.py | 192 ++++++++++---------------------- 2 files changed, 57 insertions(+), 136 deletions(-) diff --git a/python/jittor/compile_extern.py b/python/jittor/compile_extern.py index 8694002d..8217edde 100644 --- a/python/jittor/compile_extern.py +++ b/python/jittor/compile_extern.py @@ -613,7 +613,6 @@ def setup_cutlass(): extra_flags=f" {all_dir} ") LOG.vv("Get cutlass_ops: "+str(dir(cutlass_ops))) -if os.environ.get("FIX_TORCH_ERROR", "0") == "1": in_mpi = inside_mpi() FIX_TORCH_ERROR = 0 if os.name != 'nt' and not in_mpi: diff --git a/python/jittor/cutlass_ops.py b/python/jittor/cutlass_ops.py index a1bf50da..c42427e6 100644 --- a/python/jittor/cutlass_ops.py +++ b/python/jittor/cutlass_ops.py @@ -1177,11 +1177,11 @@ class ActivationTransferEpilogue { void backward(float* input, float* grad, float* output, float* output_grad, int input_m, int input_n, int grad_m, int grad_n, int output_m, int output_n) { // grad * weight.T using Gemm = OurGemmTransfer; const int lda = grad_n; - const int ldb = input_m; + const int ldb = input_n; const int ldc = output_n; const int ldd = output_n; typename Gemm::Arguments arguments{ - {grad_m, input_n, grad_n}, // TODO + {grad_m, input_m, grad_n}, // TODO {grad, lda}, {input, ldb}, {output, ldc}, @@ -1207,11 +1207,11 @@ class ActivationTransferEpilogue { void last_inp_backward(float* input, float* grad, float* output, int input_m, int input_n, int grad_m, int grad_n, int output_m, int output_n) { // output * weight.T using Gemm = OurGemm; const int lda = grad_n; - const int ldb = input_m; + const int ldb = input_n; const int ldc = output_n; const int ldd = output_n; typename Gemm::Arguments arguments{ - {grad_m, input_n, grad_n}, // TODO + {grad_m, input_m, grad_n}, // TODO {grad, lda}, {input, ldb}, {output, ldc}, @@ -1239,7 +1239,7 @@ class ActivationTransferEpilogue { int batch_size = grad_n; using Gemm = 
OurGemmW; - const int lda = input_m; + const int lda = input_n; const int ldb = grad_n; const int ldc = weight_grad_n; const int ldd = weight_grad_n; @@ -1279,80 +1279,6 @@ def __init__(self): self.weights = None def single_forward(self, a, b): - cuda_src = ''' - @alias(b, in1) - @alias(a, in0) - @alias(c, out0) - const int length_m = a_shape0; - const int length_n = b_shape1; - const int length_k = a_shape1; - - // Create a tuple of problem size for matrix multiplication - cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); - - // Initialize tensors using CUTLASS helper functions - cutlass::TensorRef tensor_a((ElementInputA*)a_p, - LayoutInputA().packed(problem_size.mk())); - cutlass::TensorRef tensor_b((ElementInputB*)b_p, - LayoutInputB().packed(problem_size.kn())); - cutlass::TensorRef tensor_d((ElementOutput*)c_p, - LayoutOutput().packed(problem_size.mn())); - - // Initialize alpha for dot product computation - ElementComputeEpilogue alpha = ElementComputeEpilogue(1); - - // Split K dimension into 1 partitions - int split_k_slices = 1; - - // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch - // instantiated CUTLASS kernel - typename Gemm::Arguments arguments{ - problem_size, // <- problem size of matrix multiplication - tensor_a, // <- reference to matrix A on device - tensor_b, // <- reference to matrix B on device - - {NULL, 0}, // <- the C matrix is treated as the bias vector. We can enable the GEMM - // to project away the N dimension by setting the stride to zero. - - tensor_d, // <- reference to matrix D on device, - {alpha}, // <- alpha - split_k_slices}; // <- k-dimension split factor - - // Using the arguments, query for extra workspace required for matrix multiplication computation - size_t workspace_size = Gemm::get_workspace_size(arguments); - - - // Allocate workspace memory - auto temp1 = exe.alloc_temp(workspace_size); - - // Allocate workspace memory - // cutlass::device_memory::allocation workspace(workspace_size); - - // Instantiate CUTLASS kernel depending on templates - Gemm gemm_op; - - // Check the problem size is supported or not - cutlass::Status status = gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - - // Initialize CUTLASS kernel with arguments and workspace pointer - status = gemm_op.initialize(arguments, temp1.ptr); - CUTLASS_CHECK(status); - - // Launch initialized CUTLASS kernel - status = gemm_op(); - CUTLASS_CHECK(status); - ''' - output = jt.code((a.shape[0], b.shape[1]), a.dtype, [a, b], cuda_header=cuda_header, cuda_src=cuda_src) - output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} - return output - - def execute(self, a, *args): - self.shapes = [] - self.dtypes = [] - self.max_dim = 0 - self.weights = list(args) - weights = args cuda_header = ''' #undef out #include @@ -1465,89 +1391,85 @@ def execute(self, a, *args): NumStages>; ''' cuda_src = ''' - int length_m; - int length_n; - int length_k; - ElementComputeEpilogue alpha = ElementComputeEpilogue(1); - int split_k_slices = 1; - cutlass::Status status; - float* a_p; - ''' - for i in range(len(weights)): - if i == 0: - assert a.shape[1] == weights[0].shape[0] - else: - assert weights[i-1].shape[1] == weights[i].shape[0] - mm = np.maximum(weights[i].shape[0], weights[i].shape[1]) - if self.max_dim < mm: self.max_dim = mm - self.shapes.append((a.shape[0], weights[i].shape[1])) - self.dtypes.append(a.dtype) - cuda_src += f''' - if ({i} == 0) {{ - length_m = 
in0_shape0; - length_n = in1_shape1; - length_k = in0_shape1; - }} else {{ - length_m = in0_shape0; - length_n = in{i}_shape1; - length_k = in{0 if i == 0 else i-1}_shape1; - }} - - if ({i} == 0) a_p = in0_p; - else - a_p = out{0 if i == 0 else i-1}_p; + @alias(b, in1) + @alias(a, in0) + @alias(c, out0) + const int length_m = a_shape0; + const int length_n = b_shape1; + const int length_k = a_shape1; + // Create a tuple of problem size for matrix multiplication - cutlass::gemm::GemmCoord problem_size{i}(length_m, length_n, length_k); + cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); // Initialize tensors using CUTLASS helper functions - cutlass::TensorRef tensor_a{i}((ElementInputA*)a_p, - LayoutInputA().packed(problem_size{i}.mk())); - cutlass::TensorRef tensor_b{i}((ElementInputB*)in{i+1}_p, - LayoutInputB().packed(problem_size{i}.kn())); - cutlass::TensorRef tensor_d{i}((ElementOutput*)out{i}_p, - LayoutOutput().packed(problem_size{i}.mn())); + cutlass::TensorRef tensor_a((ElementInputA*)a_p, + LayoutInputA().packed(problem_size.mk())); + cutlass::TensorRef tensor_b((ElementInputB*)b_p, + LayoutInputB().packed(problem_size.kn())); + cutlass::TensorRef tensor_d((ElementOutput*)c_p, + LayoutOutput().packed(problem_size.mn())); + // Initialize alpha for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch // instantiated CUTLASS kernel - typename Gemm::Arguments arguments{i}{{ - problem_size{i}, // <- problem size of matrix multiplication - tensor_a{i}, // <- reference to matrix A on device - tensor_b{i}, // <- reference to matrix B on device + typename Gemm::Arguments arguments{ + problem_size, // <- problem size of matrix multiplication + tensor_a, // <- reference to matrix A on device + tensor_b, // <- reference to matrix B on device - {{NULL, 0}}, // <- the C matrix is treated as the bias vector. We can enable the GEMM + {NULL, 0}, // <- the C matrix is treated as the bias vector. We can enable the GEMM // to project away the N dimension by setting the stride to zero. 
- tensor_d{i}, // <- reference to matrix D on device, - {{alpha}}, // <- alpha - split_k_slices}}; // <- k-dimension split factor + tensor_d, // <- reference to matrix D on device, + {alpha}, // <- alpha + split_k_slices}; // <- k-dimension split factor // Using the arguments, query for extra workspace required for matrix multiplication computation - size_t workspace_size{i} = Gemm::get_workspace_size(arguments{i}); + size_t workspace_size = Gemm::get_workspace_size(arguments); // Allocate workspace memory - auto temp{i} = exe.alloc_temp(workspace_size{i}); + auto temp1 = exe.alloc_temp(workspace_size); // Allocate workspace memory - // cutlass::device_memory::allocation workspace{i}(workspace_size); + // cutlass::device_memory::allocation workspace(workspace_size); // Instantiate CUTLASS kernel depending on templates - Gemm gemm_op{i}; + Gemm gemm_op; // Check the problem size is supported or not - status = gemm_op{i}.can_implement(arguments{i}); + cutlass::Status status = gemm_op.can_implement(arguments); CUTLASS_CHECK(status); // Initialize CUTLASS kernel with arguments and workspace pointer - status = gemm_op{i}.initialize(arguments{i}, temp{i}.ptr); + status = gemm_op.initialize(arguments, temp1.ptr); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel - status = gemm_op{i}(); + status = gemm_op(); CUTLASS_CHECK(status); - ''' + ''' + output = jt.code((a.shape[0], b.shape[1]), a.dtype, [a, b], cuda_header=cuda_header, cuda_src=cuda_src) + output.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + # print(output) + return output + + def execute(self, a, *args): + self.shapes = [] + self.dtypes = [] + self.max_dim = 0 + self.weights = list(args) self.input = a - self.outputs = jt.code(self.shapes, self.dtypes, [a] + self.weights, cuda_header=cuda_header, cuda_src=cuda_src) - self.outputs[0].compile_options = {f"FLAGS: --expt-relaxed-constexpr --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} + weights = args + for i in range(len(weights)): + self.outputs.append(self.single_forward(a, weights[i])) + a = self.outputs[-1] + # print(self.outputs) return self.outputs[-1] def backward(self, grad, weight, output): From 4ec83f017f64716787297104524823b401ecb50d Mon Sep 17 00:00:00 2001 From: Exusial Date: Tue, 3 May 2022 23:30:55 -0400 Subject: [PATCH 5/7] add fp16 support.
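
This patch routes half-precision inputs through the same generated CUTLASS sources by
swapping the element type in the generated header strings. A minimal sketch of that
dispatch (illustrative only; select_header is a made-up helper, and the real headers
are the long template strings in cutlass_ops.py shown below):

    import jittor as jt

    def select_header(x, cuda_header):
        # fp32 is the default; for float16 inputs the element type used by the
        # generated kernels is swapped to cutlass::half_t before the source is
        # handed to jt.code for compilation.
        if x.dtype == jt.float16:
            cuda_header = cuda_header.replace("float", "cutlass::half_t")
        return cuda_header

The blanket str.replace also rewrites comments that mention "float", which is harmless
for compilation but worth knowing when reading the generated source.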
--- python/jittor/cutlass_ops.py | 37 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/python/jittor/cutlass_ops.py b/python/jittor/cutlass_ops.py index c42427e6..1155f7d0 100644 --- a/python/jittor/cutlass_ops.py +++ b/python/jittor/cutlass_ops.py @@ -203,7 +203,8 @@ def depthwise_src_backward(x, weights): #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) ''' - + if x.dtype == jt.float16: + cuda_header = cuda_header.replace("float", "cutlass::half_t") cuda_src = ''' @alias(weights, in1) @alias(x, in0) @@ -452,7 +453,8 @@ def depthwise_filter_backward(x, weights, diff): #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) ''' - + if x.dtype == jt.float16: + cuda_header = cuda_header.replace("float", "cutlass::half_t") cuda_src = ''' @alias(grad, in2) @alias(weights, in1) @@ -709,6 +711,8 @@ def execute(self, x, weights): } ''' + if x.dtype == jt.float16: + cuda_header = cuda_header.replace("float", "cutlass::half_t") cuda_src = ''' // __global__ void kernel() {} @alias(weights, in1) @@ -770,8 +774,8 @@ def execute(self, x, weights): def grad(self, g): return depthwise_src_backward(g, self.weights), depthwise_filter_backward(self.x, self.weights, g) -def backward_header(): - return ''' +def backward_header(use_fp16 = False): + cuda_header = ''' #pragma once #undef out #include @@ -1267,6 +1271,9 @@ class ActivationTransferEpilogue { CUTLASS_CHECK(status); } ''' + if use_fp16: + cuda_header = cuda_header.replace("float", "cutlass::half_t") + return cuda_header class FullyFusedMlp(jt.Function): def __init__(self): @@ -1390,6 +1397,8 @@ def single_forward(self, a, b): SwizzleThreadBlock, NumStages>; ''' + if a.dtype == jt.float16: + cuda_header = cuda_header.replace("float", "cutlass::half_t") cuda_src = ''' @alias(b, in1) @alias(a, in0) @@ -1473,30 +1482,34 @@ def execute(self, a, *args): return self.outputs[-1] def backward(self, grad, weight, output): - cuda_header = backward_header() - cuda_src = ''' + use_fp16 = (grad.dtype == jt.float16) + cuda_header = backward_header(use_fp16) + converter = "(cutlass::half_t*)" if use_fp16 else "" + cuda_src = f''' @alias(input, in0) @alias(grad, in1) @alias(weight, in2) @alias(weight_grad, out0) @alias(inp_grad, out1) - weight_backward(input_p, grad_p, weight_grad_p, input_shape0, input_shape1, grad_shape0, grad_shape1, weight_shape0, weight_shape1); - backward(weight_p, grad_p, input_p, inp_grad_p, weight_shape0, weight_shape1, grad_shape0, grad_shape1, input_shape0, input_shape1); + weight_backward({converter}input_p, {converter}grad_p, {converter}weight_grad_p, input_shape0, input_shape1, grad_shape0, grad_shape1, weight_shape0, weight_shape1); + backward({converter}weight_p, {converter}grad_p, {converter}input_p, {converter}inp_grad_p, weight_shape0, weight_shape1, grad_shape0, grad_shape1, input_shape0, input_shape1); ''' weight_grad, out_grad = jt.code([weight.shape, output.shape], [weight.dtype, output.dtype], [output, grad, weight], cuda_header=cuda_header, cuda_src=cuda_src) weight_grad.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} return out_grad, weight_grad def last_backward(self, grad, weight, output): - cuda_header = backward_header() - cuda_src = ''' + use_fp16 = (grad.dtype == jt.float16) + cuda_header = backward_header(use_fp16) + converter = "(cutlass::half_t*)" if use_fp16 else "" + cuda_src =
f''' @alias(input, in0) @alias(grad, in1) @alias(weight, in2) @alias(weight_grad, out0) @alias(out_grad, out1) - weight_backward(input_p, grad_p, weight_grad_p, input_shape0, input_shape1, grad_shape0, grad_shape1, weight_shape0, weight_shape1); - last_inp_backward(weight_p, grad_p, out_grad_p, weight_shape0, weight_shape1, grad_shape0, grad_shape1, input_shape0, input_shape1); + weight_backward({converter}input_p, {converter}grad_p, {converter}weight_grad_p, input_shape0, input_shape1, grad_shape0, grad_shape1, weight_shape0, weight_shape1); + last_inp_backward({converter}weight_p, {converter}grad_p, {converter}out_grad_p, weight_shape0, weight_shape1, grad_shape0, grad_shape1, input_shape0, input_shape1); ''' weight_grad, out_grad = jt.code([weight.shape, output.shape], [weight.dtype, output.dtype], [output, grad, weight], cuda_header=cuda_header, cuda_src=cuda_src) weight_grad.compile_options = {f"FLAGS: --expt-relaxed-constexpr -I{cutlass_path}/include -I{cutlass_path}/tools/util/include ": 1} From db9a6086f67aaa8c2cd881e26cfc2d8a19f1a4ea Mon Sep 17 00:00:00 2001 From: Exusial Date: Mon, 9 May 2022 22:22:10 -0400 Subject: [PATCH 6/7] add cublas support. --- .../cuda/cublas/ops/cublas_acc_matmul_op.cc | 137 ++++++++++++++++++ .../cuda/cublas/ops/cublas_acc_matmul_op.h | 27 ++++ python/jittor/nn.py | 1 + python/jittor/src/profiler/profiler.cc | 5 +- 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.cc create mode 100644 python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.h diff --git a/python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.cc b/python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.cc new file mode 100644 index 00000000..a6b280e0 --- /dev/null +++ b/python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.cc @@ -0,0 +1,137 @@ +// *************************************************************** +// Copyright (c) 2022 Jittor. All Rights Reserved. +// Maintainers: +// Guowei Yang <471184555@qq.com> +// Dun Liang . +// +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. 
+// *************************************************************** + +#include "var.h" +#include "cublas_acc_matmul_op.h" +#include "cublas_wrapper.h" + +using namespace std; + +namespace jittor { + +extern int use_tensorcore; + +#ifndef JIT + +CublasAccMatmulOp::CublasAccMatmulOp(Var* a, Var* b, bool trans_a, bool trans_b, int stride_a, int stride_b, int offset_a, int offset_b) + : a(a), b(b), trans_a(trans_a), trans_b(trans_b),stride_a(stride_a),stride_b(stride_b),offset_a(offset_a),offset_b(offset_b) { + flags.set(NodeFlags::_cuda, 1); + flags.set(NodeFlags::_cpu, 0); + flags.set(NodeFlags::_manual_set_vnbb); + a->flags.set(NodeFlags::_needed_by_backward); + b->flags.set(NodeFlags::_needed_by_backward); + // TODO: support int8 * int8 + ASSERT(a->dtype().is_float() && b->dtype().is_float()) << "type of two inputs should be the same"; + // TODO: support diffrent input type + ASSERT(a->dtype().dsize() == b->dtype().dsize()) << "type of two inputs should be the same"; + c = create_output(nullptr, a->dtype()); +} + +void CublasAccMatmulOp::infer_shape() { + ASSERTop(a->shape.size(),==,2); + ASSERTop(b->shape.size(),==,2); + int n = a->shape[0], m = a->shape[1]; + int m_ = b->shape[0], k = b->shape[1]; + if (trans_a) { + swap(n, m); + } + if (trans_b) { + swap(m_, k); + } + ASSERTop(m,==,m_); + if(stride_a != -1) + n = stride_a; + if(stride_b != -1) + k = stride_b; + c->set_shape({n, k}); +} + +void CublasAccMatmulOp::jit_prepare(JK& jk) { + jk << _CS("[T:") << a->dtype(); + jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N'); + jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N'); + jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D')); + jk << ']'; +} + +#else // JIT +#pragma clang diagnostic ignored "-Wtautological-compare" + +void CublasAccMatmulOp::jit_run() { + cublasHandle_t& handle_ = cublas_handle; + const T alpha = 1.0f; + const T beta = 0.0f; + + const auto& as = a->shape; + const auto& bs = b->shape; + auto n = as[0]; + auto m = as[1]; + auto k = bs[1]; + if ('@Trans_a'=='T') { + n = as[1]; + m = as[0]; + } + if ('@Trans_b'=='T') { + k = bs[0]; + } + + // a: [n,m], b: [m,k], c: [n,k] + #if CUDART_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + if (use_tensorcore>=3) { + computeType = CUBLAS_COMPUTE_32F_FAST_16F; + } else if (use_tensorcore==2) { + computeType = CUBLAS_COMPUTE_32F_FAST_16BF; + } else if (use_tensorcore==1) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } + if (a->dtype() == ns_float16 + || b->dtype() == ns_float16 || c->dtype() == ns_float16) { + computeType = CUBLAS_COMPUTE_16F; + } + #else + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + cudaDataType_t computeType = get_dtype(c->dtype()); + if (use_tensorcore) { + algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + if (a->dtype() == ns_float16 + || b->dtype() == ns_float16 || c->dtype() == ns_float16) { + computeType = CUDA_R_16F; + algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + #endif + int ldb, lda; + ldb = '@Trans_b' == 'N' ? k : m; + lda = '@Trans_a' == 'N' ? 
m : n; + if(stride_b != -1) + k = stride_b; + // if(stride_a != -1) + // n = stride_a; + checkCudaErrors(cublasGemmEx(handle_, + CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, + k, n, m, &alpha, + b->ptr() + offset_b,get_dtype(b->dtype()), ldb, + a->ptr() + offset_a,get_dtype(a->dtype()), lda, &beta, + c->ptr(),get_dtype(c->dtype()), k, + computeType, algo)); + // checkCudaErrors(cublas@op@@gemm(handle_, + // CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, + // k, n, m, &alpha, + // b->ptr(), '@Trans_b' == 'N' ? k : m, + // a->ptr(), '@Trans_a' == 'N' ? m : n, &beta, + // c->ptr(), k)); + + +} +#endif // JIT + +} // jittor diff --git a/python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.h b/python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.h new file mode 100644 index 00000000..17652c5d --- /dev/null +++ b/python/jittor/extern/cuda/cublas/ops/cublas_acc_matmul_op.h @@ -0,0 +1,27 @@ +// *************************************************************** +// Copyright (c) 2022 Jittor. All Rights Reserved. +// Maintainers: +// Guoye Yang <498731903@qq.com> +// Dun Liang . +// +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. +// *************************************************************** +#pragma once +#include "op.h" + +namespace jittor { + +struct CublasAccMatmulOp : Op { + Var* a, * b, * c; + bool trans_a, trans_b; + int stride_a, stride_b; + int offset_a, offset_b; + CublasAccMatmulOp(Var* a, Var* b, bool trans_a, bool trans_b, int stride_a=-1, int stride_b=-1, int offset_a=0, int offset_b=0); + + const char* name() const override { return "cublas_acc_matmul"; } + void infer_shape() override; + DECLARE_jit_run; +}; + +} // jittor \ No newline at end of file diff --git a/python/jittor/nn.py b/python/jittor/nn.py index 4da7f3f4..a3ee5d34 100644 --- a/python/jittor/nn.py +++ b/python/jittor/nn.py @@ -809,6 +809,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, self.groups = groups self.is_depthwise_conv = self.groups == self.out_channels and self.groups == self.in_channels if self.is_depthwise_conv and jt.flags.use_cuda: + print("??????") self.depthwise_conv = DepthwiseConv(stride, padding, dilation) assert in_channels % groups == 0, 'in_channels must be divisible by groups' assert out_channels % groups == 0, 'out_channels must be divisible by groups' diff --git a/python/jittor/src/profiler/profiler.cc b/python/jittor/src/profiler/profiler.cc index fbf879cd..8761d881 100644 --- a/python/jittor/src/profiler/profiler.cc +++ b/python/jittor/src/profiler/profiler.cc @@ -428,7 +428,10 @@ vector> Profiler::report(const string& sort_key) { continue; auto& fname = fnames[i]; rep.push_back({name, fname}); - ss << std::setw(w) << name; + if (name.size() > 100) + ss << std::setw(w) << name.substr(0, 100); + else + ss << std::setw(w) << name; if (name.size() >= w-1) ss << "\n" << std::setw(w) << " "; ss << std::setw(w) << fname; From 15f59ace352c7e5e7abc93ebc3bb47b0c6c7bd36 Mon Sep 17 00:00:00 2001 From: Exusial Date: Tue, 17 May 2022 08:29:20 -0400 Subject: [PATCH 7/7] temp. 
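
Besides the temporary debug hooks, this patch gives cublas_matmul a grad() so the new
matmul path can be trained end to end. For reference, a sketch of the standard rule it
follows for c = a @ b with no transposes (matmul_grad is a made-up helper for
illustration, not part of the patch):

    import jittor as jt

    def matmul_grad(a, b, dout):
        # c = a @ b  =>  da = dout @ b^T,  db = a^T @ dout
        da = jt.matmul(dout, b.transpose())
        db = jt.matmul(a.transpose(), dout)
        return da, db

The real implementation folds these extra transposes into the trans_a/trans_b arguments
of cublas_matmul instead of materializing b^T or a^T.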
--- .../cuda/cublas/ops/cublas_matmul_op.cc | 22 +++++++++++++++++++ .../extern/cuda/cublas/ops/cublas_matmul_op.h | 1 + python/jittor/optim.py | 3 +++ python/jittor/src/op_compiler.cc | 22 ++++++++++++++++--- python/jittor/src/opt/pass_manager.cc | 14 ++++++++---- python/jittor/src/opt/tuner_manager.cc | 17 +++++++++++--- python/jittor/src/type/fp16_op_type.cc | 8 ++++++- 7 files changed, 76 insertions(+), 11 deletions(-) diff --git a/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc b/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc index b6dfde3c..83994fa8 100644 --- a/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc +++ b/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc @@ -11,6 +11,7 @@ #include "var.h" #include "cublas_matmul_op.h" #include "cublas_wrapper.h" +#include "ops/op_register.h" using namespace std; @@ -49,6 +50,27 @@ void CublasMatmulOp::infer_shape() { c->set_shape({n, k}); } +static auto make_cublas_matmul = get_op_info("cublas_matmul") + .get_constructor(); + +VarPtr CublasMatmulOp::grad(Var* out, Var* dout, Var* v, int v_index) { + // a [b,n,m] b [b,m,k], c[b,n,k] + // c = a*b + if (v_index == 0) { + if (trans_a) + return make_cublas_matmul(b, dout, trans_b, 1); + else + // da = dc*b^T + return make_cublas_matmul(dout, b, 0, trans_b^1); + } else { + if (trans_b) + return make_cublas_matmul(dout, a, 1, trans_a); + else + // db = a^T*dc + return make_cublas_matmul(a, dout, trans_a^1, 0); + } +} + void CublasMatmulOp::jit_prepare(JK& jk) { jk << _CS("[T:") << a->dtype(); jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N'); diff --git a/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.h b/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.h index 24c5d1b8..c7435d87 100644 --- a/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.h +++ b/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.h @@ -19,6 +19,7 @@ struct CublasMatmulOp : Op { const char* name() const override { return "cublas_matmul"; } void infer_shape() override; + VarPtr grad(Var* out, Var* dout, Var* v, int v_index) override; DECLARE_jit_run; }; diff --git a/python/jittor/optim.py b/python/jittor/optim.py index 830378d7..86fd1925 100644 --- a/python/jittor/optim.py +++ b/python/jittor/optim.py @@ -211,10 +211,12 @@ def step(self, loss): def step(self, loss=None, retain_graph=False): self.pre_step(loss, retain_graph) + print("lr: ", lr) for pg in self.param_groups: lr = pg.get("lr", self.lr) for p, g in zip(pg["params"], pg["grads"]): if p.is_stop_grad(): continue + print("g: ", g) p.update(p - g * lr) self.post_step() @@ -386,6 +388,7 @@ def step(self, loss=None, retain_graph=False): m.update(b0 * m + (1-b0) * g) v.update(b1 * v + (1-b1) * g * g) step_size = lr * jt.sqrt(1-b1**n) / (1-b0 ** n) + # print("delta: ", m * step_size / (jt.sqrt(v) + eps)) p.update(p - m * step_size / (jt.sqrt(v) + eps)) self.post_step() diff --git a/python/jittor/src/op_compiler.cc b/python/jittor/src/op_compiler.cc index 11f753aa..8b42be33 100644 --- a/python/jittor/src/op_compiler.cc +++ b/python/jittor/src/op_compiler.cc @@ -731,11 +731,15 @@ string OpCompiler::get_jit_src(Op* op) { string name = op->name(); string name2 = Op::op_name_to_file_name(name); string name3 = Op::file_name_to_class_name(name2); + if (name == "fused") { string src = get_fused_src((FusedOp*)op); + // if (src.find("define op4_index_t int32") != string::npos) + // LOGir << "here here here" << src; ASSERT(src.size()); return src; } + // LOGir << "name: " << " " << name << " " << name2 << " " << name3; auto op_info = 
get_op_info(name); auto& src_path = op_info.source_path; @@ -759,15 +763,17 @@ string OpCompiler::get_jit_src(Op* op) { else after_include_src += src; } + // if (src.find("define op4_index_t int32") != string::npos) + // LOGir << "here here here" << src; ASSERT(file_exist(_to_winstr(src_path))) << src_path; LOGvvv << "Read from" << src_path; string src = read_all(_to_winstr(src_path)); ASSERT(src.size()) << "Source read failed:" << src_path; - unordered_map defs(jit_define.begin(), jit_define.end()); LOGvvv << "Precompile with key:" << defs; src = precompile(defs, src); - + // if (src.find("define op4_index_t int32") != string::npos) + // LOGir << "?????" << src; // find the last occur of #include "..."\n auto pos = src.rfind("#include"); if (pos == string::npos) pos=0; @@ -812,6 +818,7 @@ string OpCompiler::get_fused_src(FusedOp* op) { } Op* opi = op->ops[oi]; string src = get_jit_src(opi); + // LOGir << "@@@: " << src; op_srcs.push_back(move(src)); } return OpCompiler::__get_fused_src(op->ops, op_srcs, op_members); @@ -865,7 +872,7 @@ string OpCompiler::__get_fused_src( "int", "float", "bool", "CHECK", "STRINGIZE", "void", "__restrict__", "if", "true", "false", "Op", "Var", "Node", "itof", "assert", "ASSERT", - "float64" + "float64", "float16" }; auto not_change = [&](const string& s) -> bool { if (unchanged.count(s)) return true; @@ -914,6 +921,7 @@ string OpCompiler::__get_fused_src( std::regex_match(src, cm, e); ASSERT(cm.size()>=2) << src; string name3 = cm[1]; + for (uint i=0; ifind("ops/broadcast_to_op.h") != string::npos && src->find("") == string::npos && src->find("op4_Tx") != string::npos && src->find("op9_Tx") == string::npos) + // LOGir << "!!!!" << *src; for (auto op_type : op_types) op_type->post_pass(&oc); string src_after_passes; + // if (src->find("ops/broadcast_to_op.h") != string::npos && src->find("") == string::npos && src->find("op6_Tx") != string::npos) + // LOGir << "????" << *src; // if is fused op if (oc.op) { + // LOGir << "????????????? 
" << *src; TunerManager tm(&oc); src_after_passes = tm.tune(); src = &src_after_passes; } + // if (src->find("ops/broadcast_to_op.h") != string::npos && src->find("") == string::npos && src->find("op4_Tx") != string::npos && src->find("op9_Tx") == string::npos) + // if (src->find("58aff3bc47edee4")!=string::npos) + // LOGir << "1233243423 " << *src << oc.op; op->compile_optimize(*src); auto ret = oc.compile(op->get_jit_key(get_jk()), *src); return ret; diff --git a/python/jittor/src/opt/pass_manager.cc b/python/jittor/src/opt/pass_manager.cc index 07c3eebf..534154ef 100644 --- a/python/jittor/src/opt/pass_manager.cc +++ b/python/jittor/src/opt/pass_manager.cc @@ -63,8 +63,10 @@ bool PassManager::check(Pass* pass) { } void PassManager::run_passes() { + if(oc->op->get_hash_name() == "58aff3bc47edee4") + LOGir << "hahaha1" << all.to_string(); auto& ir = *main_ir; - + // LOGir << all.to_string(); LOGvvvv << "KernelIR:\n" << ir.to_string(); if (oc->op->ops.size() == 1 && oc->op->ops[0]->name() == string("array")) { ir.remove_all_unused(); @@ -77,6 +79,7 @@ void PassManager::run_passes() { } return; } + run_pass(); run_pass(); run_pass(); @@ -96,7 +99,6 @@ void PassManager::run_passes() { run_pass(); // tmp disable ConstVarPass // run_pass(); - run_pass(); if (cc_type == "icc") { @@ -106,17 +108,21 @@ void PassManager::run_passes() { run_pass(); } run_pass(); + run_pass(); + run_pass(); + // if(all.to_string().find("58aff3bc47edee4") != string::npos) + // LOGir << "hahaha1" << all.to_string(); run_pass(); + run_pass(); run_pass(); + run_pass(); - run_pass(); run_pass(); - run_pass(); } diff --git a/python/jittor/src/opt/tuner_manager.cc b/python/jittor/src/opt/tuner_manager.cc index b6e01207..84a12b0f 100644 --- a/python/jittor/src/opt/tuner_manager.cc +++ b/python/jittor/src/opt/tuner_manager.cc @@ -43,20 +43,31 @@ string TunerManager::tune() { run_tuner(&pm); run_tuner(&pm); run_tuner(&pm); - + // if (pm.all.to_string().find("define op4_index_t int32") != string::npos) + // LOGir << "?????" << pm.all.to_string(); // use the best tuner if it is confidence enough if (best_tuner && best_tuner->confidence) { - if (jit_search_kernel) + + if (jit_search_kernel) { + // LOGir << "?????" << pm.all.to_string(); searcher.search(best_tuner->candidates); + } else { if (best_tuner->confidence >= 10) { auto& loop_options = oc->op->get_loop_options_tuned(); for (auto& kv : best_tuner->candidates) loop_options[kv.first] = kv.second.front(); oc->op->update_jit_key(); - PassManager pm(oc); + // if (oc->get_src().find("define op4_index_t int32") != string::npos) + // LOGir << "?????" << oc->get_src(); + string* src = &oc->src; + PassManager pm(oc); + // if (pm.all.to_string().find("define op4_index_t int32") != string::npos) + // LOGir << "?????" << pm.all.to_string(); pm.run_passes(); src_after_passes = pm.all.to_string(); + // if (pm.all.to_string().find("58aff3bc47edee4") != string::npos) + // LOGir << "?????" 
<< pm.all.to_string(); } } } diff --git a/python/jittor/src/type/fp16_op_type.cc b/python/jittor/src/type/fp16_op_type.cc index f12a5e4f..851d7217 100644 --- a/python/jittor/src/type/fp16_op_type.cc +++ b/python/jittor/src/type/fp16_op_type.cc @@ -8,6 +8,7 @@ #include "utils/str_utils.h" #include "ops/op_register.h" #include "op_compiler.h" +#include "fused_op.h" namespace jittor { @@ -35,7 +36,7 @@ struct FP16OpType : OpByType { {"logical_not", "(!($2))"}, {"bitwise_not", "(~($2))"}, {"negative", "(-($2))"}, - {"abs", "::abs($2)"}, + {"abs", "::__habs($2)"}, {"log", "::hlog(($1)($2))"}, {"exp", "::hexp(($1)($2))"}, {"sqrt", "::hsqrt(($1)($2))"}, @@ -171,6 +172,10 @@ struct FP16OpType : OpByType { void post_pass(OpCompiler* oc) { string& src = oc->src; + if(oc->op && oc->op->get_hash_name() == "58aff3bc47edee4") + LOGir << src; + // if (src.find("ops/broadcast_to_op.h") != string::npos && src.find("") == string::npos) + // LOGir << "sssss " << src; if (src.find("float16") == string::npos) return; int i = src.rfind("#include"); @@ -178,6 +183,7 @@ struct FP16OpType : OpByType { i = src.find('\n', i) + 1; src = src.substr(0, i) + "#include \"type/fp16_compute.h\"\n" + src.substr(i); + // LOGir << "eeeee " << src; return; } };
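
Usage sketch for the ops added in this series (assumes the environment variable and
module names introduced above; shapes and flags are illustrative, not part of the
patches):

    import os
    os.environ["use_cutlass"] = "1"        # must be set before jittor.nn is imported
    import jittor as jt
    from jittor import nn

    jt.flags.use_cuda = 1
    mlp = nn.FullyFusedMLP((20, 30, 40))   # two fused linear layers
    x = jt.randn(128, 20)
    y = mlp(x)                             # forward runs through cutlass_ops.FullyFusedMlp
    print(y.shape)                         # expected: [128, 40]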