From cbbba151e5e71e0a0cec64e6ec84fe5c7d2de893 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 19 Jun 2024 15:43:16 +0800 Subject: [PATCH 01/29] temp --- env.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 env.sh diff --git a/env.sh b/env.sh new file mode 100644 index 0000000000..fccbb02918 --- /dev/null +++ b/env.sh @@ -0,0 +1,7 @@ +cd build/ +export BUDDY_MLIR_BUILD_DIR=$PWD +export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build +export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} + +export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ +cd ../ \ No newline at end of file From 0858f29501b321372059431672e3026effd5dc02 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 19 Jun 2024 20:21:22 +0800 Subject: [PATCH 02/29] fix/maxpool2d_simplify --- frontend/Python/graph/operation.py | 9 +++++++++ .../graph/transform/useless_op_eliminate.py | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py index 14bfbf2752..6b543224e5 100644 --- a/frontend/Python/graph/operation.py +++ b/frontend/Python/graph/operation.py @@ -124,10 +124,19 @@ def args(self): @property def kwargs(self): return self._keyword_arguments + + @property + def parents(self): + return self._parents + + @property + def children(self): + return self._children @property def name(self): return self._name + @name.setter def name(self, new_name): self._name = new_name diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py index a99dbe02c6..2522e17984 100644 --- a/frontend/Python/graph/transform/useless_op_eliminate.py +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -42,13 +42,24 @@ def maxpool2d_simplify(graph: Graph): and getitem_node.args[1] == 0 ): new_node = MaxPool2dOp() - new_node.name = getitem_node.name + new_node.name = node.name.replace("_with_indices", "") for arg in node.args: new_node.add_argument(arg) for parent in node._parents: new_node.add_parent(parent) + parent_node = graph.node_table[parent] + for cindex, child in enumerate(parent_node.children): + if child == node.name: + parent_node.children[cindex] = new_node.name for child in getitem_node._children: new_node.add_children(child) + child_node = graph.node_table[child] + for pindex, parent in enumerate(child_node.parents): + if parent == getitem_node.name: + child_node.parents[pindex] = new_node.name + for aindex, arg in enumerate(child_node.args): + if arg == getitem_node.name: + child_node.args[aindex] = new_node.name new_node.tensor_meta["shape"] = getitem_node.tensor_meta[ "shape" ] @@ -63,4 +74,4 @@ def maxpool2d_simplify(graph: Graph): for j, op in enumerate(graph.body): if op == getitem_node: graph.body[j] = new_node - break + break \ No newline at end of file From f2fd5720a8f636636346eb37e85b9ec04b1915cb Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 19 Jun 2024 20:22:42 +0800 Subject: [PATCH 03/29] fix/maxpool2d_simplify --- env.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index fccbb02918..0000000000 --- a/env.sh +++ /dev/null @@ -1,7 +0,0 @@ -cd build/ -export BUDDY_MLIR_BUILD_DIR=$PWD -export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -export 
PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} - -export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ -cd ../ \ No newline at end of file From b2c4c29128a342196551f5889cc9388d3eb8f010 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Thu, 20 Jun 2024 16:04:14 +0800 Subject: [PATCH 04/29] add json_encoder and json_decoder --- examples/BuddyLeNet/buddy-lenet-import.py | 16 ++++ examples/BuddyLeNet/graph.dot | 56 +++++++++++ examples/BuddyLeNet/lenet.json | 1 + frontend/Python/graph/graph.py | 83 +++++++++++++++++ frontend/Python/graph/json_decoder.py | 93 +++++++++++++++++++ frontend/Python/graph/operation.py | 8 ++ .../graph/transform/useless_op_eliminate.py | 15 ++- 7 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 examples/BuddyLeNet/graph.dot create mode 100644 examples/BuddyLeNet/lenet.json create mode 100644 frontend/Python/graph/json_decoder.py diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 95e76de253..76fcb32cf0 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -29,6 +29,7 @@ from buddy.compiler.graph import GraphDriver from buddy.compiler.graph.transform import simply_fuse from buddy.compiler.ops import tosa +from buddy.compiler.graph.json_decoder import json_to_graph from model import LeNet # Retrieve the LeNet model path from environment variables. @@ -74,3 +75,18 @@ ) float32_param.tofile(Path(current_path) / "arg0.data") + +# Convert the lenet graph to JSON string +json_str = graph.to_json() +with open(os.path.join(path_prefix, "lenet.json"), "w") as module_file: + module_file.write(json_str) + +# Convert the lenet graph Json string to a lenet graph +graph0 = json_to_graph(json_str) +graph0.lower_to_top_level_ir() +print(graph0._imported_module) + +# Convert the lenet graph to DOT string +dot_str = graph.to_dot() +with open(os.path.join(path_prefix, "graph.dot"), "w") as module_file: + module_file.write(dot_str) \ No newline at end of file diff --git a/examples/BuddyLeNet/graph.dot b/examples/BuddyLeNet/graph.dot new file mode 100644 index 0000000000..04313d9e35 --- /dev/null +++ b/examples/BuddyLeNet/graph.dot @@ -0,0 +1,56 @@ +// Buddy Graph +digraph { + arg0_1 -> convolution + arg1_1 -> convolution + arg2_1 -> convolution_1 + arg3_1 -> convolution_1 + arg4_1 -> permute + arg5_1 -> addmm + arg6_1 -> permute_1 + arg7_1 -> addmm_1 + arg8_1 -> permute_2 + arg9_1 -> addmm_2 + arg10_1 -> convolution + convolution -> relu + relu -> max_pool2d + max_pool2d -> convolution_1 + convolution_1 -> relu_1 + relu_1 -> max_pool2d_1 + max_pool2d_1 -> view + view -> addmm + permute -> addmm + addmm -> relu_2 + relu_2 -> addmm_1 + permute_1 -> addmm_1 + addmm_1 -> relu_3 + relu_3 -> addmm_2 + permute_2 -> addmm_2 + addmm_2 -> output + arg0_1 [fillcolor=white shape=ellipse style=filled] + arg1_1 [fillcolor=white shape=ellipse style=filled] + arg2_1 [fillcolor=white shape=ellipse style=filled] + arg3_1 [fillcolor=white shape=ellipse style=filled] + arg4_1 [fillcolor=white shape=ellipse style=filled] + arg5_1 [fillcolor=white shape=ellipse style=filled] + arg6_1 [fillcolor=white shape=ellipse style=filled] + arg7_1 [fillcolor=white shape=ellipse style=filled] + arg8_1 [fillcolor=white shape=ellipse style=filled] + arg9_1 [fillcolor=white shape=ellipse style=filled] + arg10_1 [fillcolor=white shape=ellipse style=filled] + convolution [fillcolor=deepskyblue 
shape=box style=filled] + relu [fillcolor=deepskyblue shape=box style=filled] + max_pool2d [fillcolor=red shape=box style=filled] + convolution_1 [fillcolor=deepskyblue shape=box style=filled] + relu_1 [fillcolor=deepskyblue shape=box style=filled] + max_pool2d_1 [fillcolor=red shape=box style=filled] + view [fillcolor=deepskyblue shape=box style=filled] + permute [fillcolor=deepskyblue shape=box style=filled] + addmm [fillcolor=deepskyblue shape=box style=filled] + relu_2 [fillcolor=deepskyblue shape=box style=filled] + permute_1 [fillcolor=deepskyblue shape=box style=filled] + addmm_1 [fillcolor=deepskyblue shape=box style=filled] + relu_3 [fillcolor=deepskyblue shape=box style=filled] + permute_2 [fillcolor=deepskyblue shape=box style=filled] + addmm_2 [fillcolor=deepskyblue shape=box style=filled] + output [fillcolor=white shape=ellipse style=filled] +} diff --git a/examples/BuddyLeNet/lenet.json b/examples/BuddyLeNet/lenet.json new file mode 100644 index 0000000000..11171f91ac --- /dev/null +++ b/examples/BuddyLeNet/lenet.json @@ -0,0 +1 @@ +{"graph_name": "forward", "nodes": [{"name": "arg0_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6, 1, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg1_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg2_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16, 6, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg3_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg4_1", "children": ["permute"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 256], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg5_1", "children": ["addmm"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg6_1", "children": ["permute_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg7_1", "children": ["addmm_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg8_1", "children": ["permute_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10, 84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg9_1", "children": ["addmm_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg10_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 1, 28, 28], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "convolution", "children": ["relu"], "parents": ["arg10_1", "arg0_1", "arg1_1"], "arguments": 
["arg10_1", "arg0_1", "arg1_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu", "children": ["max_pool2d"], "parents": ["convolution"], "arguments": ["convolution"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d", "children": ["convolution_1"], "parents": ["relu"], "arguments": ["relu", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 12, 12], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "convolution_1", "children": ["relu_1"], "parents": ["max_pool2d", "arg2_1", "arg3_1"], "arguments": ["max_pool2d", "arg2_1", "arg3_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu_1", "children": ["max_pool2d_1"], "parents": ["convolution_1"], "arguments": ["convolution_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d_1", "children": ["view"], "parents": ["relu_1"], "arguments": ["relu_1", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 4, 4], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "view", "children": ["addmm"], "parents": ["max_pool2d_1"], "arguments": ["max_pool2d_1", [-1, 256]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 256], "dtype": "Float32"}, "type": "ReshapeType", "class": "ViewOp"}, {"name": "permute", "children": ["addmm"], "parents": ["arg4_1"], "arguments": ["arg4_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [256, 120], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm", "children": ["relu_2"], "parents": ["arg5_1", "view", "permute"], "arguments": ["arg5_1", "view", "permute"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_2", "children": ["addmm_1"], "parents": ["addmm"], "arguments": ["addmm"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_1", "children": ["addmm_1"], "parents": ["arg6_1"], "arguments": ["arg6_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 84], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_1", "children": ["relu_3"], "parents": ["arg7_1", "relu_2", "permute_1"], "arguments": ["arg7_1", "relu_2", "permute_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_3", "children": ["addmm_2"], "parents": ["addmm_1"], "arguments": ["addmm_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_2", "children": ["addmm_2"], "parents": ["arg8_1"], "arguments": ["arg8_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 10], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_2", "children": ["output"], "parents": ["arg9_1", "relu_3", "permute_2"], "arguments": ["arg9_1", "relu_3", "permute_2"], "keyword_arguments": {}, "tensor_meta": 
{"shape": [1, 10], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "output", "children": [], "parents": [], "arguments": ["addmm_2"], "keyword_arguments": {}, "tensor_meta": {}, "type": "GetItemType", "class": "OutputOp"}], "device": "cpu", "params": [{"shape": [6, 1, 5, 5], "dtype": "Float32"}, {"shape": [6], "dtype": "Float32"}, {"shape": [16, 6, 5, 5], "dtype": "Float32"}, {"shape": [16], "dtype": "Float32"}, {"shape": [120, 256], "dtype": "Float32"}, {"shape": [120], "dtype": "Float32"}, {"shape": [84, 120], "dtype": "Float32"}, {"shape": [84], "dtype": "Float32"}, {"shape": [10, 84], "dtype": "Float32"}, {"shape": [10], "dtype": "Float32"}], "inputs": [{"shape": [1, 1, 28, 28], "dtype": "Float32"}], "subgraphs": {"subgraph0": [{"name": "convolution", "children": ["relu"], "parents": ["arg10_1", "arg0_1", "arg1_1"], "arguments": ["arg10_1", "arg0_1", "arg1_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu", "children": ["max_pool2d"], "parents": ["convolution"], "arguments": ["convolution"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d", "children": ["convolution_1"], "parents": ["relu"], "arguments": ["relu", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 12, 12], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "convolution_1", "children": ["relu_1"], "parents": ["max_pool2d", "arg2_1", "arg3_1"], "arguments": ["max_pool2d", "arg2_1", "arg3_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu_1", "children": ["max_pool2d_1"], "parents": ["convolution_1"], "arguments": ["convolution_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d_1", "children": ["view"], "parents": ["relu_1"], "arguments": ["relu_1", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 4, 4], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "view", "children": ["addmm"], "parents": ["max_pool2d_1"], "arguments": ["max_pool2d_1", [-1, 256]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 256], "dtype": "Float32"}, "type": "ReshapeType", "class": "ViewOp"}, {"name": "permute", "children": ["addmm"], "parents": ["arg4_1"], "arguments": ["arg4_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [256, 120], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm", "children": ["relu_2"], "parents": ["arg5_1", "view", "permute"], "arguments": ["arg5_1", "view", "permute"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_2", "children": ["addmm_1"], "parents": ["addmm"], "arguments": ["addmm"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_1", "children": ["addmm_1"], "parents": ["arg6_1"], "arguments": ["arg6_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 84], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_1", 
"children": ["relu_3"], "parents": ["arg7_1", "relu_2", "permute_1"], "arguments": ["arg7_1", "relu_2", "permute_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_3", "children": ["addmm_2"], "parents": ["addmm_1"], "arguments": ["addmm_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_2", "children": ["addmm_2"], "parents": ["arg8_1"], "arguments": ["arg8_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 10], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_2", "children": ["output"], "parents": ["arg9_1", "relu_3", "permute_2"], "arguments": ["arg9_1", "relu_3", "permute_2"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 10], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "output", "children": [], "parents": [], "arguments": ["addmm_2"], "keyword_arguments": {}, "tensor_meta": {}, "type": "GetItemType", "class": "OutputOp"}]}, "subgraph_map_device": {"subgraph0": "UNKNOW"}} \ No newline at end of file diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index eb78c0ff33..898f967b63 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -23,6 +23,8 @@ import ctypes import functools import numpy as np +import graphviz +import json import mlir.ir as ir import mlir.dialects.func as func @@ -324,6 +326,87 @@ def compile(self): self.lower_to_top_level_ir() self.lower_to_llvm_ir() + def to_dot(self): + """ + Converts a buddy graph to a DOT string for visualization. + + Returns: + str: A DOT string representing the buddy graph for visualization. + """ + dot = graphviz.Digraph(comment='Buddy Graph') + for op in self._body: + # if isinstance(op, PlaceholderOp): + # continue + for child in op._children: + dot.edge(op._name, child) + for op in self._body: + if isinstance(op, PlaceholderOp): + dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + # continue + elif isinstance(op, OutputOp): + dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + elif isinstance(op, MaxPool2dOp): + dot.node(op._name, shape="box", fillcolor="red", style="filled") + else: + dot.node(op._name, shape="box", fillcolor="deepskyblue", style="filled") + return str(dot) + + def to_json(self): + """ + Converts a buddy graph to a JSON string. + + Returns: + str: A JSON string representing the buddy graph. + """ + json_str = json.dumps(self, cls=BuddyGraphEncoder) + return json_str + + +class BuddyGraphEncoder(json.JSONEncoder): + """ + Custom JSON encoder for converting Buddy Graph objects to JSON strings. + + This encoder handles encoding of Graph, Op, TensorMeta, OpType, TensorDType, + and DeviceType objects to their JSON representation. + + Returns: + JSONEncoder: A JSON encoder instance for Buddy Graph objects. 
+ """ + def default(self, obj): + if isinstance(obj, Graph): + return { + 'graph_name' : obj._func_name, + 'nodes' : obj._body, + 'device' : obj.device, + 'params' : obj._fake_params, + 'inputs' : obj._inputs, + 'subgraphs' : obj.op_groups, + 'subgraph_map_device' : obj.group_map_device + } + elif isinstance(obj, Op): + return { + 'name' : obj._name, + 'children' : obj._children, + 'parents' : obj._parents, + 'arguments' : obj._arguments, + 'keyword_arguments' : obj._keyword_arguments, + 'tensor_meta' : obj._tensor_meta, + 'type' : obj._op_type, + 'class' : obj.__class__.__name__ + } + elif isinstance(obj, TensorMeta): + return { + 'shape' : obj.shape, + 'dtype' : obj.dtype + } + elif isinstance(obj, OpType): + return obj._name_ + elif isinstance(obj, TensorDType): + return obj._name_ + elif isinstance(obj, DeviceType): + return obj._name_ + else: + return super().default(obj) class GraphImporter: """ diff --git a/frontend/Python/graph/json_decoder.py b/frontend/Python/graph/json_decoder.py new file mode 100644 index 0000000000..70e5112c32 --- /dev/null +++ b/frontend/Python/graph/json_decoder.py @@ -0,0 +1,93 @@ +import json +from pathlib import Path + +from .graph import Graph, TensorDType, TensorMeta +from .graph_driver import GraphDriver +from .operation import * +from .type import * + +from ..ops.linalg import ops_registry as linalg_ops_registry +from ..ops.tosa import ops_registry as tosa_ops_registry +from ..ops.math import ops_registry as math_ops_registry +from ..ops.func import ops_registry as func_ops_registry + +def json_to_graph(json_str): + """ + Converts a buddy graph JSON string to a Graph object. + + Args: + json_str (str): The JSON string representing the buddy graph. + + Returns: + Graph: The Graph object created from the JSON data. + """ + def json_to_tensormeta(json_data): + """ + Convert JSON data to a TensorMeta object. + + Args: + json_data (dict): JSON data representing a TensorMeta object. + + Returns: + TensorMeta: The TensorMeta object created from the JSON data. 
+ """ + if 'shape' in json_data: + shape = json_data['shape'] + dtype = next((member for member in TensorDType.__members__.values() if member.value.upper() == json_data['dtype'].upper()), None) + return TensorMeta(shape, dtype) + return {} + + json_data = json.loads(json_str) + _graph = json_data + graph_name = _graph['graph_name'] + inputs = [] + params = [] + for _input in _graph['inputs']: + inputs.append(json_to_tensormeta(_input)) + for _param in _graph['params']: + params.append(json_to_tensormeta(_param)) + ops_registry = {} + ops_registry.update(func_ops_registry) + ops_registry.update(linalg_ops_registry) + ops_registry.update(tosa_ops_registry) + ops_registry.update(math_ops_registry) + graph = Graph(inputs, params, ops_registry, graph_name) + graph.device = _graph['device'] + for _node in _graph['nodes']: + op_class = _node['class'] + op = globals()[op_class]() + + op._name = _node['name'] + op._children = _node['children'] + op._parents = _node['parents'] + op._arguments = _node['arguments'] + op._keyword_arguments = _node['keyword_arguments'] + op._type = next((member for member in OpType.__members__.values() if member.value == _node['type']), None) + + # TODO : node attr tensor_meta should be Class TensorMeta + if ('shape' not in _node['tensor_meta']): + op._tensor_meta = _node['tensor_meta'] + else: + op._tensor_meta = { + 'shape' : _node['tensor_meta']['shape'], + 'dtype' : next((member for member in TensorDType.__members__.values() if member.value.upper() == _node['tensor_meta']['dtype'].upper()), None) + } + graph.add_node(op) + + for subgraph_name, subgraph_body in _graph['subgraphs'].items(): + subgraph_ops = [] + for subgraph_node in subgraph_body: + op_name = subgraph_node['name'] + op = graph.node_table[op_name] + subgraph_ops.append(op) + graph.op_groups[subgraph_name] = subgraph_ops + + for subgraph_name, subgraph_device in _graph['subgraph_map_device'].items(): + if subgraph_device == 'CPU': + graph.group_map_device[subgraph_name] = DeviceType.CPU + elif subgraph_device == 'GPU': + graph.group_map_device[subgraph_name] = DeviceType.GPU + else: + graph.group_map_device[subgraph_name] = DeviceType.UNKNOW + + return graph diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py index 14bfbf2752..7632397bd5 100644 --- a/frontend/Python/graph/operation.py +++ b/frontend/Python/graph/operation.py @@ -124,6 +124,14 @@ def args(self): @property def kwargs(self): return self._keyword_arguments + + @property + def parents(self): + return self._parents + + @property + def children(self): + return self._children @property def name(self): diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py index a99dbe02c6..2522e17984 100644 --- a/frontend/Python/graph/transform/useless_op_eliminate.py +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -42,13 +42,24 @@ def maxpool2d_simplify(graph: Graph): and getitem_node.args[1] == 0 ): new_node = MaxPool2dOp() - new_node.name = getitem_node.name + new_node.name = node.name.replace("_with_indices", "") for arg in node.args: new_node.add_argument(arg) for parent in node._parents: new_node.add_parent(parent) + parent_node = graph.node_table[parent] + for cindex, child in enumerate(parent_node.children): + if child == node.name: + parent_node.children[cindex] = new_node.name for child in getitem_node._children: new_node.add_children(child) + child_node = graph.node_table[child] + for pindex, parent in 
enumerate(child_node.parents): + if parent == getitem_node.name: + child_node.parents[pindex] = new_node.name + for aindex, arg in enumerate(child_node.args): + if arg == getitem_node.name: + child_node.args[aindex] = new_node.name new_node.tensor_meta["shape"] = getitem_node.tensor_meta[ "shape" ] @@ -63,4 +74,4 @@ def maxpool2d_simplify(graph: Graph): for j, op in enumerate(graph.body): if op == getitem_node: graph.body[j] = new_node - break + break \ No newline at end of file From d09dd7370f7cd6e69723bfe1055ec3cd94c287f5 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Thu, 20 Jun 2024 16:05:38 +0800 Subject: [PATCH 05/29] add json_encoder and json_decoder --- env.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index fccbb02918..0000000000 --- a/env.sh +++ /dev/null @@ -1,7 +0,0 @@ -cd build/ -export BUDDY_MLIR_BUILD_DIR=$PWD -export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} - -export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ -cd ../ \ No newline at end of file From 4e779240ea549758e284d3ed7db8510894b4dd0c Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 31 Jul 2024 07:35:01 +0000 Subject: [PATCH 06/29] add gpu.container_module --- frontend/Python/graph/graph.py | 6 ++- frontend/Python/ops/gpu.py | 97 ++++++++++++++++++++++++++++++++++ frontend/Python/ops/utils.py | 2 + 3 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 frontend/Python/ops/gpu.py diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index eb78c0ff33..ea71a925c7 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -361,6 +361,7 @@ def __init__( ops_registry = {} self._symbol_table = {} self._body = body + self._device = DeviceType.GPU self._func_name = func_name self._params = params self._inputs = inputs @@ -440,7 +441,7 @@ def import_graph(self) -> ir.Module: shape_list = list(arg.shape) dtype = arg.dtype mlir_dtype = self._str_to_mlir_dtype(dtype) - tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) + tensor_arg = ir.MemrefType.get(shape_list, mlir_dtype) arguments.append(tensor_arg) extern_func = [] for node in self._body: @@ -473,6 +474,9 @@ def generated_func(*args): self._import_op(node) return self._symbol_table.get(("output", 0)) + + if self._device == DeviceType.GPU: + self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() return self._module diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py new file mode 100644 index 0000000000..e0d02ab492 --- /dev/null +++ b/frontend/Python/ops/gpu.py @@ -0,0 +1,97 @@ +# ===- func.py ----------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# ===--------------------------------------------------------------------------- +# +# The registry of mappings from Buddy node to MLIR GPU kernel. +# +# ===--------------------------------------------------------------------------- + + +from typing import Tuple +import mlir.ir as ir +from mlir.dialects import gpu, memref, arith, scf + +from ..graph import TensorDType +from ..graph import ( + ReluOp +) +from .utils import * + +def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): + """ + Import the buddy ReluOp. + From Buddy ReluOp to MLIR Relu GPU kernel. + """ + assert len(node.args) == 1 + input = symbol_table.get((str(node.args[0]), 0)) + if input is None: + return + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, 0) + memref_type = ir.MemrefType.get(output_shape, element.type) + unranked_memref_type = ir.UnrankedMemRefType.get(dtype, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input) + gpu.HostRegisterOp(input_cast) + + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + c512 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) + size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1024)) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, + blockSizeX=c512.result, blockSizeY=c1.result, blockSizeZ=c1.result, + ) + # Create a GPU kernel block and define grid and block dimensions for GPU execution + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), # %bx : index, Block index X + ir.IndexType.get(), # %by : index, Block index Y + ir.IndexType.get(), # %bz : index, Block index Z + ir.IndexType.get(), # %tx : index, Thread index X + ir.IndexType.get(), # %ty : index, Thread index Y + ir.IndexType.get(), # %tz : index, Thread index Z + ir.IndexType.get(), # %num_bx : index, Grid size X + ir.IndexType.get(), # %num_by : index, Grid size Y + ir.IndexType.get(), # %num_bz : index, Grid size Z + ir.IndexType.get(), # %num_tx : index, Block size X + ir.IndexType.get(), # %num_ty : index, Block size Y + ir.IndexType.get(), # %num_tz : index, Block size Z + ] + ) + + with ir.InsertionPoint(gpu_kernel_block): + tIdX = gpu_kernel_block.arguments[3] + cst_0 = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) + for1 = scf.ForOp( + lower_bound=tIdX, + upper_bound=size, + step=gpu_kernel.blockSizeX + ) + with ir.InsertionPoint(for1.body): + load = memref.LoadOp(arg0, [for1.induction_variable]) + result = arith.MaxNumFOp(load, cst_0) + memref.StoreOp(result, arg0, [for1.induction_variable]) + scf.YieldOp([]) + + gpu.TerminatorOp() + return op + +ops_registry = { + ReluOp: relu_op +} diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 337f5a6b49..1217c6af2c 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -54,3 +54,5 @@ def mlir_element_attr_get(type_name, value): case TensorDType.Bool: return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) + +def tensor_shape_size() \ No newline at end of file From 43b36243519e1fd5e98fa9e92a61b80ea1535685 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 12 Aug 2024 09:09:49 +0000 Subject: [PATCH 07/29] [frontend] Add GPU MLIR lowering path with ReLU operation support --- 
examples/BuddyTest/.gitignore | 2 + examples/BuddyTest/import-test.py | 55 ++++++++++++ examples/BuddyTest/model.py | 30 +++++++ frontend/Python/graph/graph.py | 5 +- frontend/Python/ops/gpu.py | 140 ++++++++++++++++++++++++------ frontend/Python/ops/utils.py | 43 ++++++++- 6 files changed, 244 insertions(+), 31 deletions(-) create mode 100644 examples/BuddyTest/.gitignore create mode 100644 examples/BuddyTest/import-test.py create mode 100644 examples/BuddyTest/model.py diff --git a/examples/BuddyTest/.gitignore b/examples/BuddyTest/.gitignore new file mode 100644 index 0000000000..6e9797bbe9 --- /dev/null +++ b/examples/BuddyTest/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +forward.mlir diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py new file mode 100644 index 0000000000..b47bba9b21 --- /dev/null +++ b/examples/BuddyTest/import-test.py @@ -0,0 +1,55 @@ +# ===- buddy-lenet-import.py --------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the Test model AOT importer. +# +# ===--------------------------------------------------------------------------- + +import os +from pathlib import Path + +import numpy as np +import torch +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.graph import GraphDriver +from buddy.compiler.graph.transform import simply_fuse +from buddy.compiler.ops.gpu import ops_registry as gpu_ops_registry +from model import TestModule + +model = TestModule() +model = model.eval() + +# Initialize Dynamo Compiler with specific configurations as an importer. +dynamo_compiler = DynamoCompiler( + primary_registry=gpu_ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +data = torch.randn([1, 1, 28, 28]) +# Import the model into MLIR module and parameters. +with torch.no_grad(): + graphs = dynamo_compiler.importer(model, data) + +assert len(graphs) == 1 +graph = graphs[0] +print(graph._body) +graph.lower_to_top_level_ir() +path_prefix = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: + print(graph._imported_module, file=module_file) + \ No newline at end of file diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py new file mode 100644 index 0000000000..67f3bfdafd --- /dev/null +++ b/examples/BuddyTest/model.py @@ -0,0 +1,30 @@ +# ===- model.py ---------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# Test model definition. +# +# ===--------------------------------------------------------------------------- + +import torch +import torch.nn as nn + +class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + + def forward(self, x): + x = torch.relu(x.view(2, 14, 28)) + return x.permute([1, 2, 0]) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index ea71a925c7..7c99b4391d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -244,7 +244,7 @@ def lower_to_top_level_ir(self): output_ranks = [] output_dtypes = [] for out_node in outputs: - out_type = ir.RankedTensorType(out_node.type) + out_type = ir.MemRefType(out_node.type) shape = list(out_type.shape) dtype = out_type.element_type match str(dtype): @@ -441,7 +441,7 @@ def import_graph(self) -> ir.Module: shape_list = list(arg.shape) dtype = arg.dtype mlir_dtype = self._str_to_mlir_dtype(dtype) - tensor_arg = ir.MemrefType.get(shape_list, mlir_dtype) + tensor_arg = ir.MemRefType.get(shape_list, mlir_dtype) arguments.append(tensor_arg) extern_func = [] for node in self._body: @@ -588,6 +588,7 @@ def _import_op(self, node: Op): node (Op): The buddy node representing the operation. """ + op_name = node.__class__.__name__ op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( self._ops_registry[op_name](node, self._symbol_table) diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index e0d02ab492..2eff78f97d 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -1,4 +1,4 @@ -# ===- func.py ----------------------------------------------------------------- +# ===- gpu.py ----------------------------------------------------------------- # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,7 +25,9 @@ from ..graph import TensorDType from ..graph import ( - ReluOp + ReluOp, + ReshapeOp, + PermuteOp ) from .utils import * @@ -40,58 +42,140 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): return output_shape = list(node.tensor_meta["shape"]) dtype = node.tensor_meta["dtype"] - element = mlir_element_attr_get(dtype, 0) - memref_type = ir.MemrefType.get(output_shape, element.type) - unranked_memref_type = ir.UnrankedMemRefType.get(dtype, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input) - gpu.HostRegisterOp(input_cast) + element_type = mlir_element_type_get(dtype) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - c512 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) - size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1024)) + kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) + # Flatten the input into a one-dimensional format + output_size = tensor_shape_size(output_shape) + size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) + shape = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + memref.StoreOp(size, shape, [c0]) + memref_reshape_type = ir.MemRefType.get([output_size], element_type) + input_reshape = memref.ReshapeOp(memref_reshape_type, input, shape) + + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input) + gpu.HostRegisterOp(input_cast) gpu_kernel = gpu.LaunchOp( asyncToken=None, asyncDependencies=[], gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, - blockSizeX=c512.result, blockSizeY=c1.result, blockSizeZ=c1.result, + blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, ) - # Create a GPU kernel block and define grid and block dimensions for GPU execution gpu_kernel_block = ir.Block.create_at_start( gpu_kernel.body, [ - ir.IndexType.get(), # %bx : index, Block index X - ir.IndexType.get(), # %by : index, Block index Y - ir.IndexType.get(), # %bz : index, Block index Z - ir.IndexType.get(), # %tx : index, Thread index X - ir.IndexType.get(), # %ty : index, Thread index Y - ir.IndexType.get(), # %tz : index, Thread index Z - ir.IndexType.get(), # %num_bx : index, Grid size X - ir.IndexType.get(), # %num_by : index, Grid size Y - ir.IndexType.get(), # %num_bz : index, Grid size Z - ir.IndexType.get(), # %num_tx : index, Block size X - ir.IndexType.get(), # %num_ty : index, Block size Y - ir.IndexType.get(), # %num_tz : index, Block size Z + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z ] ) with ir.InsertionPoint(gpu_kernel_block): tIdX = gpu_kernel_block.arguments[3] cst_0 = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) - for1 = scf.ForOp( + loop = scf.ForOp( lower_bound=tIdX, upper_bound=size, step=gpu_kernel.blockSizeX ) - with ir.InsertionPoint(for1.body): - load = 
memref.LoadOp(arg0, [for1.induction_variable]) + with ir.InsertionPoint(loop.body): + load = memref.LoadOp(input_reshape, [loop.induction_variable]) result = arith.MaxNumFOp(load, cst_0) - memref.StoreOp(result, arg0, [for1.induction_variable]) + memref.StoreOp(result, input_reshape, [loop.induction_variable]) scf.YieldOp([]) gpu.TerminatorOp() + output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + memref.CopyOp(input, output) + return output + +# TODO: Implement Reshape Operation on GPU in future revisions. + +def reshape_op(node: ReshapeOp, symbol_table): + """ + Import the reshape operation. + From buddy graph ir's `ReshapeOp` operator to MLIR Memref `reshape` + operation. + + Note: If the new shape contains one and only one `-1`, the size of the new + shape will be inferred automatically. + """ + input1 = symbol_table.get((str(node.args[0]), 0)) + new_shape = [] + for i in node.args[1]: + new_shape.append(i) + output_shape = list(node.tensor_meta["shape"]) + total_size = tensor_shape_size(output_shape) + + neg_one_cnt = 0 + rest_size = 1 + for dim_siz in new_shape: + if dim_siz == -1: + neg_one_cnt += 1 + continue + rest_size *= dim_siz + + if neg_one_cnt != 0: + if neg_one_cnt > 1 or total_size % rest_size != 0: + raise ValueError("Can not infer the new shape!") + infer_dim_size = total_size // rest_size + for i, _ in enumerate(new_shape): + if new_shape[i] == -1: + new_shape[i] = infer_dim_size + + shape = memref.AllocOp(ir.MemRefType.get([len(new_shape)], ir.IndexType.get()), [], []) + for i, _ in enumerate(new_shape): + c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i)) + size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), new_shape[i])) + memref.StoreOp(size, shape, [c]) + + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + output_type = ir.MemRefType.get(new_shape, element_type) + op = memref.ReshapeOp(output_type, input1, shape) + return op +# TODO: Implement Permute Operation on GPU in future revisions. + +def permute_op(node: PermuteOp, symbol_table): + """ + Import the permute operation. + From buddy graph ir's `PermuteOp` operator to MLIR Memref `transpose` + operation. 
+ """ + input1 = symbol_table.get((str(node.args[0]), 0)) + perm = node.args[1] + perm_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm)) + + output_shape = list(node.tensor_meta["shape"]) + element_type = mlir_element_type_get(node.tensor_meta["dtype"]) + input_shape = [0] * len(output_shape) + for i, p in enumerate(perm): + input_shape[p] = output_shape[i] + + offset = 0 + input_stride = generate_strides(input_shape) + output_stride = transpose_strides(input_stride, perm) + result_type = ir.MemRefType.get( + shape=output_shape, + element_type=element_type, + layout=ir.StridedLayoutAttr.get(offset, output_stride) + ) + permute_op = memref.TransposeOp( + result=result_type, + in_=input1, + permutation=perm_attr + ) + return permute_op + ops_registry = { - ReluOp: relu_op + "ReluOp": relu_op, + "ViewOp": reshape_op, + "PermuteOp": permute_op } diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 1217c6af2c..2b2dfe4ca2 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -55,4 +55,45 @@ def mlir_element_attr_get(type_name, value): return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) -def tensor_shape_size() \ No newline at end of file +def tensor_shape_size(shape): + """ + Calculate the product of all dimensions in the given shape list, + which represents the size of the tensor. + Args: + shape: A list containing the sizes of each dimension of the tensor. + """ + size = 1 + for dim in shape: + size *= dim + return size + +def generate_strides(shape): + """ + Generate strides based on the input matrix shape. + + Args: + shape (list[int]): The shape of the input matrix, e.g., [2, 3, 4]. + + Returns: + list[int]: The corresponding strides, e.g., [12, 4, 1]. + """ + strides = [] + stride = 1 + for dim in reversed(shape): + strides.insert(0, stride) + stride *= dim + return strides + +def transpose_strides(strides, permutation): + """ + Reorder strides based on the input permutation. + + Args: + strides (list[int]): The original strides list, e.g., [12, 4, 1]. + permutation (list[int]): The permutation order, e.g., [1, 2, 0]. + + Returns: + list[int]: The reordered strides list, e.g., [4, 1, 12]. 
+ """ + transposed_strides = [strides[i] for i in permutation] + return transposed_strides From 2d4eef1c9ea58e807437312a6d3e4771b2399cff Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 12 Aug 2024 09:17:20 +0000 Subject: [PATCH 08/29] delete env.sh --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 485cccfcf9..69426a81de 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # Clangd cache .cache + +# environment bash +env.sh \ No newline at end of file From 78f6bca5125ce773d79ce7c16a2d36fc416bd1af Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 12 Aug 2024 09:19:16 +0000 Subject: [PATCH 09/29] delete env.sh --- env.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index fccbb02918..0000000000 --- a/env.sh +++ /dev/null @@ -1,7 +0,0 @@ -cd build/ -export BUDDY_MLIR_BUILD_DIR=$PWD -export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} - -export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ -cd ../ \ No newline at end of file From abce38285b8176164e826d81810f10437a0f3d32 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Fri, 16 Aug 2024 01:50:39 +0000 Subject: [PATCH 10/29] [BuddyTest] Add Test Model E2E example. --- examples/BuddyTest/.gitignore | 3 +- examples/BuddyTest/CMakeLists.txt | 29 ++++++++++++ examples/BuddyTest/import-test.py | 3 +- examples/BuddyTest/makefile | 38 +++++++++++++++ examples/BuddyTest/model.py | 4 +- examples/BuddyTest/test-main.cpp | 79 +++++++++++++++++++++++++++++++ examples/CMakeLists.txt | 4 ++ 7 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 examples/BuddyTest/CMakeLists.txt create mode 100644 examples/BuddyTest/makefile create mode 100644 examples/BuddyTest/test-main.cpp diff --git a/examples/BuddyTest/.gitignore b/examples/BuddyTest/.gitignore index 6e9797bbe9..081f173509 100644 --- a/examples/BuddyTest/.gitignore +++ b/examples/BuddyTest/.gitignore @@ -1,2 +1,3 @@ __pycache__ -forward.mlir +*.mlir +log.ll diff --git a/examples/BuddyTest/CMakeLists.txt b/examples/BuddyTest/CMakeLists.txt new file mode 100644 index 0000000000..2e3654b347 --- /dev/null +++ b/examples/BuddyTest/CMakeLists.txt @@ -0,0 +1,29 @@ +add_custom_command( + OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir + COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyTest/import-test.py + COMMENT "Generating forward.mlir" +) + + +add_custom_command( + OUTPUT forward.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llvm-as | + ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyTest/forward.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir + COMMENT "Building forward.o" + VERBATIM) + + +add_library(TEST STATIC forward.o) + +SET_TARGET_PROPERTIES(TEST PROPERTIES LINKER_LANGUAGE C) + +add_executable(buddy-test-run test-main.cpp) +target_link_directories(buddy-test-run PRIVATE 
${LLVM_MLIR_LIBRARY_DIR}) + +set(BUDDY_TEST_LIBS TEST mlir_runner_utils mlir_cuda_runtime) +target_link_libraries(buddy-test-run ${BUDDY_TEST_LIBS}) diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py index b47bba9b21..3cd2573830 100644 --- a/examples/BuddyTest/import-test.py +++ b/examples/BuddyTest/import-test.py @@ -47,9 +47,8 @@ assert len(graphs) == 1 graph = graphs[0] -print(graph._body) graph.lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(graph._imported_module, file=module_file) - \ No newline at end of file + \ No newline at end of file diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile new file mode 100644 index 0000000000..7b9dd646f9 --- /dev/null +++ b/examples/BuddyTest/makefile @@ -0,0 +1,38 @@ +#!/bin/bash +BUDDY_OPT := ../../build/bin/buddy-opt +MLIR_OPT := ../../llvm/build/bin/mlir-opt +MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate +MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner +LLC := ../../llvm/build/bin/llc +OPT_FLAG := -O0 + +ifeq ($(shell uname),Linux) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so +MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so +MLIR_CUDA_RUNTIME := ../../llvm/build/lib/libmlir_cuda_runtime.so +MTRIPLE := x86_64-unknown-linux-gnu +else ifeq ($(shell uname),Darwin) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib +MLIR_ASYNC_RUNTIME := ./../llvm/build/lib/libmlir_async_runtime.dylib +MTRIPLE := x86_64-apple-darwin +endif + +gpu-test-lower: + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_OPT} -o log.mlir + +gpu-test-translate: + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll + +gpu-test-run: + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index 67f3bfdafd..fed677d6be 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -26,5 +26,5 @@ def __init__(self): super(TestModule, self).__init__() def forward(self, x): - x = torch.relu(x.view(2, 14, 28)) - return x.permute([1, 2, 0]) + x = torch.relu(x.view(28, 28)) + return x.permute([1,0]) diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp new file mode 100644 index 0000000000..e53c25192c --- /dev/null +++ b/examples/BuddyTest/test-main.cpp @@ -0,0 +1,79 @@ +//===- test-main.cpp 
------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace buddy;
+
+extern "C" void
+_mlir_ciface_forward(MemRef<float, 2> *result, MemRef<float, 4> *input);
+
+int main() {
+  /// Initialize data containers.
+  MemRef<float, 4> input({1, 1, 28, 28});
+  MemRef<float, 2> result({28, 28});
+
+  for (int i = 0; i < 28; i++) {
+    for (int j = 0; j < 28; j++) {
+      int index = i * 28 + j;
+      input[index] = static_cast<float>(index);
+    }
+  }
+  // Print the generated data to verify
+  for (int i = 0; i < 28; i++) {
+    for (int j = 0; j < 28; j++) {
+      std::cout << input[i * 28 + j] << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  const auto inferenceStart = std::chrono::high_resolution_clock::now();
+
+  /// Execute forward inference of the model.
+  _mlir_ciface_forward(&result, &input);
+
+  const auto inferenceEnd = std::chrono::high_resolution_clock::now();
+  const std::chrono::duration<double, std::milli> inferenceTime =
+      inferenceEnd - inferenceStart;
+
+  /// Print the output data for verification.
+  std::cout << "\033[33;1m[Output] \033[0m";
+  std::cout << "[";
+  for (int i = 0; i < 28; i++) {
+    if (i > 0) std::cout << " ";
+    std::cout << "[";
+    for (int j = 0; j < 28; j++) {
+      if (j > 0) std::cout << " ";
+      std::cout << result[i * 28 + j];
+    }
+    std::cout << "]";
+    if (i < 27) std::cout << "\n ";
+  }
+  std::cout << "]" << std::endl;
+
+  /// Print the performance.
+  std::cout << "\033[33;1m[Time] \033[0m";
+  std::cout << inferenceTime.count() << " ms"
+            << std::endl;
+
+  return 0;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0b575f3f4a..a9c0a54e30 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -16,6 +16,10 @@ if (BUDDY_LENET_EXAMPLES)
   add_subdirectory(BuddyLeNet)
 endif()
 
+if (BUDDY_TEST_EXAMPLES)
+  add_subdirectory(BuddyTest)
+endif()
+
 if (BUDDY_MOBILENETV3_EXAMPLES)
   add_subdirectory(BuddyMobileNetV3)
 endif()
From 3d00fe6b58f51e14bde743dd8a0d4cac1372960b Mon Sep 17 00:00:00 2001
From: wdjyd <1014108056@qq.com>
Date: Fri, 16 Aug 2024 03:44:09 +0000
Subject: [PATCH 11/29] [BuddyTest] Add README.

---
 examples/BuddyTest/README | 65 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 examples/BuddyTest/README

diff --git a/examples/BuddyTest/README b/examples/BuddyTest/README
new file mode 100644
index 0000000000..49cb8fa64f
--- /dev/null
+++ b/examples/BuddyTest/README
@@ -0,0 +1,65 @@
+# Buddy Compiler Test Example
+
+0. Activate your python environment.
+
+1. Build LLVM/MLIR
+
+```bash
+$ cd buddy-mlir
+$ mkdir llvm/build
+$ cd llvm/build
+$ cmake -G Ninja ../llvm \
+    -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \
+    -DLLVM_TARGETS_TO_BUILD="host;RISCV;NVPTX" \
+    -DMLIR_ENABLE_CUDA_RUNNER=ON \
+    -DLLVM_ENABLE_ASSERTIONS=ON \
+    -DOPENMP_ENABLE_LIBOMPTARGET=OFF \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+    -DPython3_EXECUTABLE=$(which python3)
+$ ninja check-clang check-mlir omp
+```
+
+2. Build buddy-mlir
+
+```bash
+$ mkdir build && cd build
+$ cmake -G Ninja .. \
+    -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \
+    -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \
+    -DLLVM_ENABLE_ASSERTIONS=ON \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
+    -DPython3_EXECUTABLE=$(which python3)
+$ ninja
+$ ninja check-buddy
+```
+
+3. Set the `PYTHONPATH` environment variable.
+
+Make sure you are in the build directory.
+
+```bash
+$ export BUDDY_MLIR_BUILD_DIR=$PWD
+$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
+$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
+```
+
+4. Build and run the Test example
+
+```bash
+$ cmake -G Ninja .. -DBUDDY_TEST_EXAMPLES=ON
+$ ninja buddy-test-run
+$ cd bin
+$ ./buddy-test-run
+```
+
+## Debug the Lowering Pass Pipeline with Fake Parameters.
+
+```bash
+$ cd buddy-mlir
+$ cd examples/BuddyTest
+$ make gpu-test-lower
+$ make gpu-test-translate
+$ make gpu-test-run
+```
From ae794aaa63d187aee49ec4c146728ce10af35eb3 Mon Sep 17 00:00:00 2001
From: wdjyd <1014108056@qq.com>
Date: Fri, 16 Aug 2024 03:45:13 +0000
Subject: [PATCH 12/29] [BuddyTest] Add README.

---
 examples/BuddyTest/{README => README.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/BuddyTest/{README => README.md} (100%)

diff --git a/examples/BuddyTest/README b/examples/BuddyTest/README.md
similarity index 100%
rename from examples/BuddyTest/README
rename to examples/BuddyTest/README.md
From b57103c4cd85b873d7459288b785379058160c4b Mon Sep 17 00:00:00 2001
From: wdjyd <1014108056@qq.com>
Date: Fri, 30 Aug 2024 08:30:23 +0000
Subject: [PATCH 13/29] [frontend] Add GPU MLIR lowering path with Conv2d operation support

---
 examples/BuddyTest/README.md | 2 +-
 examples/BuddyTest/import-test.py | 2 +-
 examples/BuddyTest/makefile | 18 +++
 examples/BuddyTest/model.py | 5 +-
 examples/BuddyTest/test-main.cpp | 66 +++++---
 frontend/Python/ops/gpu.py | 240 ++++++++++++++++++++++++++++--
 6 files changed, 299 insertions(+), 34 deletions(-)

diff --git a/examples/BuddyTest/README.md b/examples/BuddyTest/README.md
index 49cb8fa64f..f057723bb3 100644
--- a/examples/BuddyTest/README.md
+++ b/examples/BuddyTest/README.md
@@ -10,7 +10,7 @@ $ mkdir llvm/build
 $ cd llvm/build
 $ cmake -G Ninja ../llvm \
     -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \
-    -DLLVM_TARGETS_TO_BUILD="host;RISCV;NVPTX" \
+    -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \
     -DMLIR_ENABLE_CUDA_RUNNER=ON \
     -DLLVM_ENABLE_ASSERTIONS=ON \
     -DOPENMP_ENABLE_LIBOMPTARGET=OFF \
diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py
index 3cd2573830..e2e863a66c 100644
--- a/examples/BuddyTest/import-test.py
+++ b/examples/BuddyTest/import-test.py
@@ -40,7 +40,7 @@
     aot_autograd_decomposition=inductor_decomp,
 )
 
-data = torch.randn([1, 1, 28, 28])
+data = torch.randn([1, 6, 32, 32])
 # Import the model into MLIR module and parameters.
with torch.no_grad(): graphs = dynamo_compiler.importer(model, data) diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile index 7b9dd646f9..02aba04064 100644 --- a/examples/BuddyTest/makefile +++ b/examples/BuddyTest/makefile @@ -36,3 +36,21 @@ gpu-test-run: ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} + +gpu-conv2d-lower: + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_OPT} -o log.mlir + +gpu-conv2d-translate: + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll + +gpu-conv2d-run: + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index fed677d6be..0439ed70f4 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -24,7 +24,8 @@ class TestModule(nn.Module): def __init__(self): super(TestModule, self).__init__() + self.conv1 = nn.Conv2d(6, 1, 5) def forward(self, x): - x = torch.relu(x.view(28, 28)) - return x.permute([1,0]) + x = self.conv1(x) + return x diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp index e53c25192c..c9d0c60801 100644 --- a/examples/BuddyTest/test-main.cpp +++ b/examples/BuddyTest/test-main.cpp @@ -25,31 +25,63 @@ using namespace buddy; extern "C" void -_mlir_ciface_forward(MemRef *result, MemRef *input); +_mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); int main() { /// Initialize data containers. 
- MemRef input({1, 1, 28, 28}); - MemRef result({28, 28}); + const int N = 1; + const int C = 6; + const int K = 1; + const int kernel_size = 5; + const int H = 32; + const int W = 32; + const int H_out = H - kernel_size + 1; + const int W_out = W - kernel_size + 1; - for (int i = 0; i < 28; i++) { - for (int j = 0; j < 28; j++) { - int index = i * 28 + j; - input[index] = static_cast(index); + MemRef input({N, C, H, W}); + MemRef filter({K, C, kernel_size, kernel_size}); + MemRef bias({K}); + MemRef result({N, K, H_out, W_out}); + + // Initial the input data + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int i = 0; i < H; i++) { + for (int j = 0; j < W; j++) { + int index = n * C * H * W + c * H * W + i * W + j; + input[index] = static_cast(1); + } + } } } - // Print the generated data to verify - for (int i = 0; i < 28; i++) { - for (int j = 0; j < 28; j++) { - std::cout << input[i * 28 + j] << " "; + for (int k = 0; k < K; k++) { + for (int c = 0; c < C; c++) { + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < kernel_size; j++) { + int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; + filter[index] = static_cast(1); + } + } } - std::cout << std::endl; } + + for (int k = 0; k < K; k++) { + bias[k] = 1; + } + + // Print the generated data to verify + + // for (int i = 0; i < H; i++) { + // for (int j = 0; j < W; j++) { + // std::cout << input[i * W + j] << " "; + // } + // std::cout << std::endl; + // } const auto inferenceStart = std::chrono::high_resolution_clock::now(); /// Execute forward inference of the model. - _mlir_ciface_forward(&result, &input); + _mlir_ciface_forward(&result, &filter, &bias, &input); const auto inferenceEnd = std::chrono::high_resolution_clock::now(); const std::chrono::duration inferenceTime = @@ -58,15 +90,15 @@ int main() { /// Print the output data for verification. std::cout << "\033[33;1m[Output] \033[0m"; std::cout << "["; - for (int i = 0; i < 28; i++) { + for (int i = 0; i < H_out; i++) { if (i > 0) std::cout << " "; std::cout << "["; - for (int j = 0; j < 28; j++) { + for (int j = 0; j < W_out; j++) { if (j > 0) std::cout << " "; - std::cout << result[i * 28 + j]; + std::cout << result[i * W_out + j]; } std::cout << "]"; - if (i < 27) std::cout << "\n "; + if (i < H_out - 1) std::cout << "\n "; } std::cout << "]" << std::endl; diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index 2eff78f97d..31654e3274 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -27,10 +27,14 @@ from ..graph import ( ReluOp, ReshapeOp, - PermuteOp + PermuteOp, + Conv2dOp, + MaxPool2dOp ) from .utils import * +TILE_WIDTH = 16 + def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): """ Import the buddy ReluOp. 
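# A quick reference for the launch pattern used in relu_op below (an
# illustrative sketch; the helper name is made up): the kernel is launched as
# a single block, and each thread walks the flattened buffer with a
# block-stride loop, so buffers of any size are covered regardless of the
# block size.
def _block_stride_indices(thread_idx, block_size, size):
    # Indices handled by one thread: thread_idx, thread_idx + block_size, ...
    return list(range(thread_idx, size, block_size))

assert _block_stride_indices(3, 4, 10) == [3, 7]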
@@ -62,24 +66,37 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): gpu_kernel = gpu.LaunchOp( asyncToken=None, asyncDependencies=[], - gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, - blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, + gridSizeX=c1.result, + gridSizeY=c1.result, + gridSizeZ=c1.result, + blockSizeX=kernels.result, + blockSizeY=c1.result, + blockSizeZ=c1.result, ) gpu_kernel_block = ir.Block.create_at_start( gpu_kernel.body, [ - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z + ir.IndexType.get(), # block_id x + ir.IndexType.get(), # block_id y + ir.IndexType.get(), # block_id z + ir.IndexType.get(), # thread_id x + ir.IndexType.get(), # thread_id y + ir.IndexType.get(), # thread_id z + ir.IndexType.get(), # grid_size x + ir.IndexType.get(), # grid_size y + ir.IndexType.get(), # grid_size z + ir.IndexType.get(), # block_size x + ir.IndexType.get(), # block_size y + ir.IndexType.get(), # block_size z ] ) with ir.InsertionPoint(gpu_kernel_block): - tIdX = gpu_kernel_block.arguments[3] - cst_0 = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) + thread_local_idx = gpu_kernel_block.arguments[3] + element_attr = mlir_element_attr_get(dtype, 0.0) + cst_0 = arith.ConstantOp(element_type, element_attr) loop = scf.ForOp( - lower_bound=tIdX, + lower_bound=thread_local_idx, upper_bound=size, step=gpu_kernel.blockSizeX ) @@ -94,8 +111,8 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): memref.CopyOp(input, output) return output -# TODO: Implement Reshape Operation on GPU in future revisions. +# TODO: Implement Reshape Operation on GPU in future revisions. def reshape_op(node: ReshapeOp, symbol_table): """ Import the reshape operation. @@ -141,8 +158,8 @@ def reshape_op(node: ReshapeOp, symbol_table): return op -# TODO: Implement Permute Operation on GPU in future revisions. +# TODO: Implement Permute Operation on GPU in future revisions. def permute_op(node: PermuteOp, symbol_table): """ Import the permute operation. @@ -174,8 +191,205 @@ def permute_op(node: PermuteOp, symbol_table): ) return permute_op + +# TODO: Consider the cases where the arguments take different values. +def convolution2d_op(node: Conv2dOp, symbol_table): + """ + Import the convolution operation. + From Buddy Conv2dOp to MLIR GPU `conv2d` kernel. + arg[0]: Tensor input + arg[1]: Tensor weight + arg[2]: Tensor? bias + arg[3]: SymInt[] stride + arg[4]: SymInt[] padding + arg[5]: SymInt[] dilation + arg[6]: bool transposed + arg[7]: SymInt[] output_padding + arg[8]: SymInt groups + """ + # Get arguments from convolution node. + assert len(node.args) == 9 + input = node.args[0] + filter = node.args[1] + bias = node.args[2] + stride = node.args[3] + input_padding = node.args[4] + dilation = node.args[5] + is_kernel_transposed = node.args[6] + out_padding = node.args[7] + groups = node.args[8] + + # TODO: Consider the cases where the variables take different values. 
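+    # The asserts below pin this kernel to the simplest convolution case:
+    # zero padding, unit dilation, no transposition, and a single group.
+    # Under those assumptions the output size follows the usual formula
+    # out_size = (in_size - filter_size) // stride + 1, e.g. the 32x32
+    # input and 5x5 filter used in this example give (32 - 5) // 1 + 1 = 28.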
+ assert input_padding[0] == input_padding[1] == 0 + assert dilation[0] == dilation[1] == 1 + assert is_kernel_transposed == False + assert out_padding[0] == out_padding[1] == 0 + assert groups == 1 + + # Prepare input, filter, and output information. + input_val = symbol_table.get((str(input), 0)) + input_shape = list(ir.MemRefType(input_val.type).shape) + filter_val = symbol_table.get((str(filter), 0)) + filter_shape = ir.MemRefType(filter_val.type).shape + bias_val = symbol_table.get((str(bias), 0)) + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + output_shape = list(node.tensor_meta["shape"]) + + batch_size = input_shape[0] + in_channels = input_shape[1] + out_channels = output_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + H_out = output_shape[2] + W_out = output_shape[3] + H_filter = filter_shape[2] + W_filter = filter_shape[3] + + output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input_val) + filter_cast = memref.CastOp(unranked_memref_type, filter_val) + output_cast = memref.CastOp(unranked_memref_type, output_val) + + gpu.HostRegisterOp(input_cast) + gpu.HostRegisterOp(filter_cast) + gpu.HostRegisterOp(output_cast) + + # Tile the input_val into Grids + block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) + in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) + out_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_channels)) + block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) + tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) + H_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_filter)) + W_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_filter)) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + + # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=batch_size_val.result, + gridSizeY=out_channels_val.result, + gridSizeZ=block_z_val.result, + blockSizeX=tile_width_val.result, + blockSizeY=tile_width_val.result, + blockSizeZ=c1.result, + ) + + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), # block_id x + ir.IndexType.get(), # block_id y + ir.IndexType.get(), # block_id z + ir.IndexType.get(), # thread_id x + ir.IndexType.get(), # thread_id y + ir.IndexType.get(), # thread_id z + ir.IndexType.get(), # grid_size x + ir.IndexType.get(), # grid_size y + ir.IndexType.get(), # grid_size z + ir.IndexType.get(), # block_size x + ir.IndexType.get(), # block_size y + ir.IndexType.get(), # block_size z + ] + ) + + with ir.InsertionPoint(gpu_kernel_block): + batch_id = gpu_kernel_block.arguments[0] + out_channel_id = gpu_kernel_block.arguments[1] + tile_id = gpu_kernel_block.arguments[2] + thread_local_idx = gpu_kernel_block.arguments[3] + thread_local_idy = 
gpu_kernel_block.arguments[4] + + # Calculate the convolution element at (h, w) for this thread + tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) + + t0 = arith.divui(tile_id, tile_num_val) + t1 = arith.muli(t0, tile_width_val) + thread_global_idx = arith.addi(t1, thread_local_idx) + + t2 = arith.remui(tile_id, tile_num_val) + t3 = arith.muli(t2, tile_width_val) + thread_global_idy = arith.addi(t3, thread_local_idy) + + stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) + stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) + t4 = arith.muli(thread_global_idx, stride_h) + t5 = arith.muli(thread_global_idy, stride_w) + + # Check if the (h, w) is out of the output bounds + ult = 6 + H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) + W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) + isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + isInBounds = arith.andi(isHInBounds, isWInBounds) + + cst_0 = arith.ConstantOp(element_type, mlir_element_attr_get(dtype, 0.0)) + branch0 = scf.IfOp(isInBounds) + with ir.InsertionPoint(branch0.then_block): + loop0 = scf.ForOp( + lower_bound=c0.result, + upper_bound=in_channels_val.result, + step=c1.result, + iter_args=[cst_0.result] + ) + with ir.InsertionPoint(loop0.body): + loop1 = scf.ForOp( + lower_bound=c0.result, + upper_bound=H_filter_val.result, + step=c1.result, + iter_args=[cst_0.result] + ) + with ir.InsertionPoint(loop1.body): + loop2 = scf.ForOp( + lower_bound=c0.result, + upper_bound=W_filter_val.result, + step=c1.result, + iter_args=[cst_0.result] + ) + with ir.InsertionPoint(loop2.body): + # TODO : loop body + in_channel_id = loop0.body.arguments[0] + filter_ele_idx = loop1.body.arguments[0] + filter_ele_idy = loop2.body.arguments[0] + input_ele_idx = arith.addi(t4, filter_ele_idx) + input_ele_idy = arith.addi(t5, filter_ele_idy) + input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) + filter_ele = memref.LoadOp(filter_val, [out_channel_id, in_channel_id, filter_ele_idx, filter_ele_idy]) + t6 = arith.mulf(input_ele, filter_ele) + iter_arg2 = loop2.body.arguments[1] + iter_res2 = arith.addf(iter_arg2, t6) + scf.YieldOp([iter_res2]) + + iter_arg1 = loop1.body.arguments[1] + iter_res1 = arith.addf(loop2, iter_arg1) + scf.YieldOp([iter_res1]) + + iter_arg0 = loop0.body.arguments[1] + iter_res0 = arith.addf(loop1, iter_arg0) + scf.YieldOp([iter_res0]) + + # Add bias data for any out_channel. 
+ bias_ele = memref.LoadOp(bias_val, [out_channel_id]) + result = arith.addf(loop0, bias_ele) + memref.StoreOp(result, output_val, [batch_id, out_channel_id, thread_global_idx, thread_global_idy]) + scf.YieldOp([]) + + gpu.TerminatorOp() + + return output_val + + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, - "PermuteOp": permute_op + "PermuteOp": permute_op, + "Conv2dOp": convolution2d_op, } From 0adf1df8769e245538e5fbc713f8ea68c96c8d56 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 2 Sep 2024 13:48:10 +0000 Subject: [PATCH 14/29] [frontend] Add GPU MLIR lowering path with MaxPool2d operation support --- examples/BuddyTest/import-test.py | 2 +- examples/BuddyTest/model.py | 4 +- examples/BuddyTest/test-main.cpp | 50 +++++----- frontend/Python/ops/gpu.py | 150 ++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 25 deletions(-) diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py index e2e863a66c..5636ba5e07 100644 --- a/examples/BuddyTest/import-test.py +++ b/examples/BuddyTest/import-test.py @@ -40,7 +40,7 @@ aot_autograd_decomposition=inductor_decomp, ) -data = torch.randn([1, 6, 32, 32]) +data = torch.randn([1, 1, 32, 32]) # Import the model into MLIR module and parameters. with torch.no_grad(): graphs = dynamo_compiler.importer(model, data) diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index 0439ed70f4..a6b6e7d71b 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -24,8 +24,10 @@ class TestModule(nn.Module): def __init__(self): super(TestModule, self).__init__() - self.conv1 = nn.Conv2d(6, 1, 5) + self.conv1 = nn.Conv2d(1, 6, 5) + self.pool = nn.MaxPool2d(2, 2) def forward(self, x): x = self.conv1(x) + x = self.pool(x) return x diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp index c9d0c60801..d1764bccd2 100644 --- a/examples/BuddyTest/test-main.cpp +++ b/examples/BuddyTest/test-main.cpp @@ -24,24 +24,28 @@ using namespace buddy; +// extern "C" void +// _mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); + extern "C" void -_mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); +_mlir_ciface_forward(MemRef *result, MemRef *input); int main() { /// Initialize data containers. 
const int N = 1; - const int C = 6; + const int C = 1; const int K = 1; - const int kernel_size = 5; + const int kernel_size = 2; + const int stride = 2; const int H = 32; const int W = 32; - const int H_out = H - kernel_size + 1; - const int W_out = W - kernel_size + 1; + const int H_out = H / kernel_size; + const int W_out = W / kernel_size; MemRef input({N, C, H, W}); - MemRef filter({K, C, kernel_size, kernel_size}); - MemRef bias({K}); - MemRef result({N, K, H_out, W_out}); + // MemRef filter({K, C, kernel_size, kernel_size}); + // MemRef bias({K}); + MemRef result({N, C, H_out, W_out}); // Initial the input data for (int n = 0; n < N; n++) { @@ -49,25 +53,25 @@ int main() { for (int i = 0; i < H; i++) { for (int j = 0; j < W; j++) { int index = n * C * H * W + c * H * W + i * W + j; - input[index] = static_cast(1); - } - } - } - } - for (int k = 0; k < K; k++) { - for (int c = 0; c < C; c++) { - for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < kernel_size; j++) { - int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; - filter[index] = static_cast(1); + input[index] = static_cast((float)index/(H*W)); } } } } + // for (int k = 0; k < K; k++) { + // for (int c = 0; c < C; c++) { + // for (int i = 0; i < kernel_size; i++) { + // for (int j = 0; j < kernel_size; j++) { + // int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; + // filter[index] = static_cast(1); + // } + // } + // } + // } - for (int k = 0; k < K; k++) { - bias[k] = 1; - } + // for (int k = 0; k < K; k++) { + // bias[k] = 1; + // } // Print the generated data to verify @@ -81,7 +85,7 @@ int main() { const auto inferenceStart = std::chrono::high_resolution_clock::now(); /// Execute forward inference of the model. - _mlir_ciface_forward(&result, &filter, &bias, &input); + _mlir_ciface_forward(&result, &input); const auto inferenceEnd = std::chrono::high_resolution_clock::now(); const std::chrono::duration inferenceTime = diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index 31654e3274..c99f074ec5 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -387,9 +387,159 @@ def convolution2d_op(node: Conv2dOp, symbol_table): return output_val +# TODO: Consider the cases where the maxpool2d operation needs padding. +def maxpool2d_op(node: MaxPool2dOp, symbol_table): + """ + Import the maxpool2d operation. + From Buddy MaxPool2dOp to MLIR GPU `max_pool2d` kernel. 
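+
+    Mapping sketch (as implemented below): each GPU thread computes one
+    output element. The output plane is tiled into TILE_WIDTH x TILE_WIDTH
+    tiles; the launch grid is (batch_size, in_channels, number_of_tiles)
+    and every block holds TILE_WIDTH x TILE_WIDTH threads. Threads whose
+    (h, w) falls outside the output bounds are masked out with an scf.if.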
+ """ + if len(node.args) == 5: + raise NotImplementedError + input1 = node.args[0] + kernel = node.args[1] + stride = node.args[2] + + # Prepare padding data + if len(node.args) > 3: + pad = node.args[3] + else: + pad = [0 for _ in kernel] + + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + output_shape = node.tensor_meta["shape"] + + batch_size = output_shape[0] + in_channels = output_shape[1] + H_out = output_shape[2] + W_out = output_shape[3] + + input_val = symbol_table.get((str(input1), 0)) + output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input_val) + output_cast = memref.CastOp(unranked_memref_type, output_val) + + gpu.HostRegisterOp(input_cast) + gpu.HostRegisterOp(output_cast) + + # Tile the input_val into Grids + block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) + in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) + block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) + tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + + # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=batch_size_val.result, + gridSizeY=in_channels_val.result, + gridSizeZ=block_z_val.result, + blockSizeX=tile_width_val.result, + blockSizeY=tile_width_val.result, + blockSizeZ=c1.result, + ) + + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), # block_id x + ir.IndexType.get(), # block_id y + ir.IndexType.get(), # block_id z + ir.IndexType.get(), # thread_id x + ir.IndexType.get(), # thread_id y + ir.IndexType.get(), # thread_id z + ir.IndexType.get(), # grid_size x + ir.IndexType.get(), # grid_size y + ir.IndexType.get(), # grid_size z + ir.IndexType.get(), # block_size x + ir.IndexType.get(), # block_size y + ir.IndexType.get(), # block_size z + ] + ) + + with ir.InsertionPoint(gpu_kernel_block): + batch_id = gpu_kernel_block.arguments[0] + in_channel_id = gpu_kernel_block.arguments[1] + tile_id = gpu_kernel_block.arguments[2] + thread_local_idx = gpu_kernel_block.arguments[3] + thread_local_idy = gpu_kernel_block.arguments[4] + + # Calculate the convolution element at (h, w) for this thread + tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) + + t0 = arith.divui(tile_id, tile_num_val) + t1 = arith.muli(t0, tile_width_val) + thread_global_idx = arith.addi(t1, thread_local_idx) + + t2 = arith.remui(tile_id, tile_num_val) + t3 = arith.muli(t2, tile_width_val) + thread_global_idy = arith.addi(t3, thread_local_idy) + + kernel_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) + kernel_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) + stride_h = arith.ConstantOp(ir.IndexType.get(), 
ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) + stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) + t4 = arith.muli(thread_global_idx, stride_h) + t5 = arith.muli(thread_global_idy, stride_w) + + first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, t4, t5]) + + # Check if the (h, w) is out of the output bounds + ult = 6 + H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) + W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) + isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + isInBounds = arith.andi(isHInBounds, isWInBounds) + + branch0 = scf.IfOp(isInBounds) + with ir.InsertionPoint(branch0.then_block): + loop0 = scf.ForOp( + lower_bound=c0.result, + upper_bound=kernel_h.result, + step=c1.result, + iter_args=[first_ele.result] + ) + with ir.InsertionPoint(loop0.body): + loop1 = scf.ForOp( + lower_bound=c0.result, + upper_bound=kernel_w.result, + step=c1.result, + iter_args=[first_ele.result] + ) + with ir.InsertionPoint(loop1.body): + # TODO : loop body + kernel_ele_idx = loop0.body.arguments[0] + kernel_ele_idy = loop1.body.arguments[0] + input_ele_idx = arith.addi(t4, kernel_ele_idx) + input_ele_idy = arith.addi(t5, kernel_ele_idy) + input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) + iter_arg1 = loop1.body.arguments[1] + iter_res1 = arith.maxnumf(iter_arg1, input_ele) + scf.YieldOp([iter_res1]) + + iter_arg0 = loop0.body.arguments[1] + iter_res0 = arith.maxnumf(loop1, iter_arg0) + scf.YieldOp([iter_res0]) + + memref.StoreOp(loop0, output_val, [batch_id, in_channel_id, thread_global_idx, thread_global_idy]) + scf.YieldOp([]) + + gpu.TerminatorOp() + + return output_val + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, "PermuteOp": permute_op, "Conv2dOp": convolution2d_op, + "MaxPool2dOp": maxpool2d_op } From f63634141a5fd8b9ebf0ed4a44fcc9dbf433a7d2 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Tue, 3 Sep 2024 02:12:11 +0000 Subject: [PATCH 15/29] [frontend] Fix Permute Op --- examples/BuddyTest/import-test.py | 3 +- examples/BuddyTest/model.py | 8 +++- frontend/Python/ops/gpu.py | 72 +++++++++++++++++-------------- frontend/Python/ops/utils.py | 31 ------------- 4 files changed, 48 insertions(+), 66 deletions(-) diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py index 5636ba5e07..79620d9d44 100644 --- a/examples/BuddyTest/import-test.py +++ b/examples/BuddyTest/import-test.py @@ -40,13 +40,14 @@ aot_autograd_decomposition=inductor_decomp, ) -data = torch.randn([1, 1, 32, 32]) +data = torch.randn([1, 1, 12, 10]) # Import the model into MLIR module and parameters. 
with torch.no_grad(): graphs = dynamo_compiler.importer(model, data) assert len(graphs) == 1 graph = graphs[0] +print(graph.body) graph.lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index a6b6e7d71b..d72af61c95 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -26,8 +26,12 @@ def __init__(self): super(TestModule, self).__init__() self.conv1 = nn.Conv2d(1, 6, 5) self.pool = nn.MaxPool2d(2, 2) + self.fc1 = nn.Linear(120,84) def forward(self, x): - x = self.conv1(x) - x = self.pool(x) + # x = self.conv1(x) + # x = self.pool(x) + x = x.view(-1, 120) + x = self.fc1(x) return x + diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index c99f074ec5..042770a8b5 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -176,9 +176,15 @@ def permute_op(node: PermuteOp, symbol_table): for i, p in enumerate(perm): input_shape[p] = output_shape[i] + # Prepare input_stride and output_stride data + input_stride = [] + stride = 1 + for dim in reversed(input_shape): + input_stride.insert(0, stride) + stride *= dim + output_stride = [input_stride[i] for i in perm] + offset = 0 - input_stride = generate_strides(input_shape) - output_stride = transpose_strides(input_stride, perm) result_type = ir.MemRefType.get( shape=output_shape, element_type=element_type, @@ -188,8 +194,10 @@ def permute_op(node: PermuteOp, symbol_table): result=result_type, in_=input1, permutation=perm_attr - ) - return permute_op + ) + output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + memref.CopyOp(permute_op, output) + return output # TODO: Consider the cases where the arguments take different values. 
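# The stride bookkeeping in permute_op above can be checked with a small
# standalone sketch (illustrative only; the helper name is made up). For a
# row-major tensor the strides are the running products of the trailing
# dimensions, and permuting the tensor merely reorders those strides.
def _strides_for(shape):
    strides, stride = [], 1
    for dim in reversed(shape):
        strides.insert(0, stride)
        stride *= dim
    return strides

# e.g. shape [2, 3, 4] gives strides [12, 4, 1]; the permutation [1, 2, 0]
# reorders them to [4, 1, 12], which is what the StridedLayoutAttr above
# encodes for the transposed view.
assert _strides_for([2, 3, 4]) == [12, 4, 1]
assert [_strides_for([2, 3, 4])[i] for i in [1, 2, 0]] == [4, 1, 12]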
@@ -239,10 +247,10 @@ def convolution2d_op(node: Conv2dOp, symbol_table): batch_size = input_shape[0] in_channels = input_shape[1] out_channels = output_shape[0] - H_in = input_shape[2] - W_in = input_shape[3] - H_out = output_shape[2] - W_out = output_shape[3] + in_size_h = input_shape[2] + in_size_w = input_shape[3] + out_size_h = output_shape[2] + out_size_w = output_shape[3] H_filter = filter_shape[2] W_filter = filter_shape[3] @@ -257,7 +265,7 @@ def convolution2d_op(node: Conv2dOp, symbol_table): gpu.HostRegisterOp(output_cast) # Tile the input_val into Grids - block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) out_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_channels)) @@ -307,7 +315,7 @@ def convolution2d_op(node: Conv2dOp, symbol_table): thread_local_idy = gpu_kernel_block.arguments[4] # Calculate the convolution element at (h, w) for this thread - tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) t0 = arith.divui(tile_id, tile_num_val) @@ -325,10 +333,10 @@ def convolution2d_op(node: Conv2dOp, symbol_table): # Check if the (h, w) is out of the output bounds ult = 6 - H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) - W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) - isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) + out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) + isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) isInBounds = arith.andi(isHInBounds, isWInBounds) cst_0 = arith.ConstantOp(element_type, mlir_element_attr_get(dtype, 0.0)) @@ -411,8 +419,8 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): batch_size = output_shape[0] in_channels = output_shape[1] - H_out = output_shape[2] - W_out = output_shape[3] + out_size_h = output_shape[2] + out_size_w = output_shape[3] input_val = symbol_table.get((str(input1), 0)) output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) @@ -424,7 +432,7 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): gpu.HostRegisterOp(output_cast) # Tile the input_val into Grids - block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) @@ -471,7 +479,7 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): 
thread_local_idy = gpu_kernel_block.arguments[4] # Calculate the convolution element at (h, w) for this thread - tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) t0 = arith.divui(tile_id, tile_num_val) @@ -482,35 +490,34 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): t3 = arith.muli(t2, tile_width_val) thread_global_idy = arith.addi(t3, thread_local_idy) - kernel_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) - kernel_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) + kernel_size_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) + kernel_size_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) - t4 = arith.muli(thread_global_idx, stride_h) - t5 = arith.muli(thread_global_idy, stride_w) - - first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, t4, t5]) + init_ele_idx = arith.muli(thread_global_idx, stride_h) + init_ele_idy = arith.muli(thread_global_idy, stride_w) # Check if the (h, w) is out of the output bounds ult = 6 - H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) - W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) - isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) + out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) + isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) isInBounds = arith.andi(isHInBounds, isWInBounds) branch0 = scf.IfOp(isInBounds) with ir.InsertionPoint(branch0.then_block): + first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, init_ele_idx, init_ele_idy]) loop0 = scf.ForOp( lower_bound=c0.result, - upper_bound=kernel_h.result, + upper_bound=kernel_size_h.result, step=c1.result, iter_args=[first_ele.result] ) with ir.InsertionPoint(loop0.body): loop1 = scf.ForOp( lower_bound=c0.result, - upper_bound=kernel_w.result, + upper_bound=kernel_size_w.result, step=c1.result, iter_args=[first_ele.result] ) @@ -518,8 +525,8 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): # TODO : loop body kernel_ele_idx = loop0.body.arguments[0] kernel_ele_idy = loop1.body.arguments[0] - input_ele_idx = arith.addi(t4, kernel_ele_idx) - input_ele_idy = arith.addi(t5, kernel_ele_idy) + input_ele_idx = arith.addi(init_ele_idx, kernel_ele_idx) + input_ele_idy = arith.addi(init_ele_idy, kernel_ele_idy) input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) iter_arg1 = loop1.body.arguments[1] iter_res1 = arith.maxnumf(iter_arg1, input_ele) @@ -536,6 +543,7 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): return output_val + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 2b2dfe4ca2..012340d475 100644 --- 
a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -66,34 +66,3 @@ def tensor_shape_size(shape): for dim in shape: size *= dim return size - -def generate_strides(shape): - """ - Generate strides based on the input matrix shape. - - Args: - shape (list[int]): The shape of the input matrix, e.g., [2, 3, 4]. - - Returns: - list[int]: The corresponding strides, e.g., [12, 4, 1]. - """ - strides = [] - stride = 1 - for dim in reversed(shape): - strides.insert(0, stride) - stride *= dim - return strides - -def transpose_strides(strides, permutation): - """ - Reorder strides based on the input permutation. - - Args: - strides (list[int]): The original strides list, e.g., [12, 4, 1]. - permutation (list[int]): The permutation order, e.g., [1, 2, 0]. - - Returns: - list[int]: The reordered strides list, e.g., [4, 1, 12]. - """ - transposed_strides = [strides[i] for i in permutation] - return transposed_strides From 72cdc8269855e7b0baff3b22c280b7bdade9584b Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Tue, 10 Sep 2024 07:29:02 +0000 Subject: [PATCH 16/29] [frontend] Fix implementation error in permute and conv_2d operation --- examples/BuddyTest/CMakeLists.txt | 4 +- examples/BuddyTest/makefile | 20 +++++----- frontend/Python/ops/gpu.py | 61 ++++++++++++++++--------------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/examples/BuddyTest/CMakeLists.txt b/examples/BuddyTest/CMakeLists.txt index 2e3654b347..8039bfcc15 100644 --- a/examples/BuddyTest/CMakeLists.txt +++ b/examples/BuddyTest/CMakeLists.txt @@ -7,9 +7,9 @@ add_custom_command( add_custom_command( OUTPUT forward.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | ${LLVM_MLIR_BINARY_DIR}/llvm-as | ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyTest/forward.o diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile index 02aba04064..9c4c2e4a0c 100644 --- a/examples/BuddyTest/makefile +++ b/examples/BuddyTest/makefile @@ -20,27 +20,27 @@ MTRIPLE := x86_64-apple-darwin endif gpu-test-lower: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_OPT} -o log.mlir gpu-test-translate: - @${MLIR_OPT} forward.mlir 
-gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll gpu-test-run: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} gpu-conv2d-lower: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_OPT} -o log.mlir gpu-conv2d-translate: @@ -50,7 +50,7 @@ gpu-conv2d-translate: ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll gpu-conv2d-run: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index 042770a8b5..fcf6d1f398 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -21,7 +21,7 @@ from typing import Tuple import mlir.ir as ir -from mlir.dialects import gpu, memref, arith, scf +from mlir.dialects import gpu, memref, arith, scf, vector from ..graph import TensorDType from ..graph import ( @@ -167,37 +167,38 @@ def permute_op(node: PermuteOp, symbol_table): operation. 
""" input1 = symbol_table.get((str(node.args[0]), 0)) - perm = node.args[1] - perm_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm)) + perm_map = node.args[1] + perm_map_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm_map)) output_shape = list(node.tensor_meta["shape"]) - element_type = mlir_element_type_get(node.tensor_meta["dtype"]) - input_shape = [0] * len(output_shape) - for i, p in enumerate(perm): - input_shape[p] = output_shape[i] - - # Prepare input_stride and output_stride data - input_stride = [] - stride = 1 - for dim in reversed(input_shape): - input_stride.insert(0, stride) - stride *= dim - output_stride = [input_stride[i] for i in perm] - - offset = 0 - result_type = ir.MemRefType.get( - shape=output_shape, - element_type=element_type, - layout=ir.StridedLayoutAttr.get(offset, output_stride) + dtype = node.tensor_meta["dtype"] + + element_type = mlir_element_type_get(dtype) + element_attr = mlir_element_attr_get(dtype, 0.0) + + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + f0 = arith.ConstantOp(element_type, element_attr) + + v0 = vector.transfer_read( + vector=ir.VectorType.get(output_shape, element_type), + source=input1, + indices=[c0]*len(output_shape), + permutation_map=perm_map_attr, + padding=f0 ) - permute_op = memref.TransposeOp( - result=result_type, - in_=input1, - permutation=perm_attr - ) - output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - memref.CopyOp(permute_op, output) - return output + + transpose = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + + vector.transfer_write( + result=None, + vector=v0, + source=transpose, + indices=[c0]*len(output_shape), + permutation_map=ir.AffineMapAttr.get( + ir.AffineMap.get_permutation([i for i in range(len(output_shape))]) + ) + ) + return transpose # TODO: Consider the cases where the arguments take different values. 
@@ -246,7 +247,7 @@ def convolution2d_op(node: Conv2dOp, symbol_table): batch_size = input_shape[0] in_channels = input_shape[1] - out_channels = output_shape[0] + out_channels = output_shape[1] in_size_h = input_shape[2] in_size_w = input_shape[3] out_size_h = output_shape[2] From cf703c7246b0d6a685de871d833c0f1b41232636 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 18 Sep 2024 13:39:14 +0000 Subject: [PATCH 17/29] [frontend] Add LeNet example for E2E execution in GPU device --- examples/BuddyLeNet/CMakeLists.txt | 73 +++++--- examples/BuddyLeNet/buddy-lenet-import.py | 4 +- frontend/Python/graph/graph.py | 28 ++- frontend/Python/graph/graph_driver.py | 7 +- frontend/Python/graph/transform/fuse_ops.py | 2 +- frontend/Python/ops/gpu.py | 179 +++++++++++++++++++- frontend/Python/ops/tosa.py | 108 ++++++++++-- 7 files changed, 349 insertions(+), 52 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 9698f617bc..c10571f883 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -18,37 +18,54 @@ add_custom_command( VERBATIM) add_custom_command( - OUTPUT subgraph0.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir - -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | - ${BUDDY_BINARY_DIR}/buddy-opt - -eliminate-empty-tensors - -convert-tensor-to-linalg - -linalg-bufferize - -convert-linalg-to-affine-loops - -lower-affine - -func-bufferize-dynamic-offset - -arith-bufferize - -tensor-bufferize - -buffer-deallocation - -finalizing-bufferize - -convert-vector-to-scf - -expand-strided-metadata - -convert-vector-to-llvm - -convert-arith-to-llvm - -finalize-memref-to-llvm - -convert-scf-to-cf - -llvm-request-c-wrappers - -convert-arith-to-llvm - -convert-func-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llvm-as | - ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + OUTPUT subgraph0.ll + COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir + COMMENT "Building subgraph0.ll" + VERBATIM) + +add_custom_command( + OUTPUT subgraph0.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll COMMENT "Building subgraph0.o" VERBATIM) +# add_custom_command( +# OUTPUT subgraph0.o +# COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, 
tosa-to-arith))" | +# ${BUDDY_BINARY_DIR}/buddy-opt +# -eliminate-empty-tensors +# -convert-tensor-to-linalg +# -linalg-bufferize +# -convert-linalg-to-affine-loops +# -lower-affine +# -func-bufferize-dynamic-offset +# -arith-bufferize +# -tensor-bufferize +# -buffer-deallocation +# -finalizing-bufferize +# -convert-vector-to-scf +# -expand-strided-metadata +# -convert-vector-to-llvm +# -convert-arith-to-llvm +# -finalize-memref-to-llvm +# -convert-scf-to-cf +# -llvm-request-c-wrappers +# -convert-arith-to-llvm +# -convert-func-to-llvm +# -reconcile-unrealized-casts | +# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | +# ${LLVM_MLIR_BINARY_DIR}/llvm-as | +# ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# COMMENT "Building subgraph0.o" +# VERBATIM) + add_library(LENET STATIC subgraph0.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) @@ -56,5 +73,5 @@ SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) add_executable(buddy-lenet-run buddy-lenet-main.cpp) target_link_directories(buddy-lenet-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${OpenCV_LIBS}) +set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_cuda_runtime ${OpenCV_LIBS}) target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS}) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 95e76de253..65ef5127f3 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -28,7 +28,7 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver from buddy.compiler.graph.transform import simply_fuse -from buddy.compiler.ops import tosa +from buddy.compiler.ops import tosa, gpu from model import LeNet # Retrieve the LeNet model path from environment variables. @@ -44,7 +44,7 @@ # Initialize Dynamo Compiler with specific configurations as an importer. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=gpu.ops_registry, aot_autograd_decomposition=inductor_decomp, ) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 7c99b4391d..3ca8aa298a 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -26,6 +26,7 @@ import mlir.ir as ir import mlir.dialects.func as func +import mlir.dialects.bufferization as buffer from mlir.passmanager import * from mlir.execution_engine import * from mlir import runtime as rt @@ -105,6 +106,7 @@ def __init__( fake_params: List[TensorMeta], ops_registry: dict, func_name: str, + device: DeviceType = DeviceType.GPU ) -> None: """ Initializes the Graph. 
@@ -123,7 +125,7 @@ def __init__( self._inputs = inputs self.node_table: Dict[str, Op] = {} self._fake_params = fake_params - self.device = "cpu" + self.device = device self._imported_module = None self._ops_registry = ops_registry self._func_name = func_name @@ -174,7 +176,7 @@ def init_op_group(self): continue group = [op] subgraph_name = "subgraph{}".format(i) - self.group_map_device[subgraph_name] = DeviceType.UNKNOW + self.group_map_device[subgraph_name] = DeviceType.GPU self.op_groups[subgraph_name] = group def fuse_ops(self, pattern_list: List[FunctionType]): @@ -237,6 +239,8 @@ def lower_to_top_level_ir(self): self._inputs, self._func_name, self._ops_registry, + False, + self.device ) self._imported_module = fx_importer.import_graph() outputs = fx_importer.get_output_nodes() @@ -347,6 +351,7 @@ def __init__( func_name: str, ops_registry: dict, do_param_pack: bool = False, + device: DeviceType = DeviceType.CPU, ): """ Initializes the buddy Graph importer. @@ -361,7 +366,7 @@ def __init__( ops_registry = {} self._symbol_table = {} self._body = body - self._device = DeviceType.GPU + self._device = device self._func_name = func_name self._params = params self._inputs = inputs @@ -441,7 +446,7 @@ def import_graph(self) -> ir.Module: shape_list = list(arg.shape) dtype = arg.dtype mlir_dtype = self._str_to_mlir_dtype(dtype) - tensor_arg = ir.MemRefType.get(shape_list, mlir_dtype) + tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) arguments.append(tensor_arg) extern_func = [] for node in self._body: @@ -461,6 +466,11 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] + # if self._device == DeviceType.GPU: + # returns = [ + # buffer.to_tensor(ret) + # for ret in returns + # ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) @@ -577,6 +587,16 @@ def _import_placeholder( else: placeholder_name = args_list[self._num_input_visited] + # TODO : Consider converting arg type from RankedTensorType to MemRefType + if self._device == DeviceType.GPU: + placeholder_name = buffer.to_memref( + ir.MemRefType.get( + list(node.tensor_meta.shape), + self._str_to_mlir_dtype(node.tensor_meta.dtype) + ), + placeholder_name + ) + self._symbol_table[(str(node.name), 0)] = placeholder_name self._num_input_visited += 1 diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 50a8869d5a..62a1239859 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -112,6 +112,7 @@ def build_subgraph_by_group(self): for subgraph_name in self._graph.op_groups.keys(): subgraph_input = [] subgraph_body = [] + subgraph_device = self._graph.group_map_device[subgraph_name] # Construct input placeholder nodes for inp in subgraphs_inputs[subgraph_name]: @@ -142,7 +143,11 @@ def build_subgraph_by_group(self): # Create subgraph and add it to the dictionary subgraph = Graph( - subgraph_input, [], self._graph._ops_registry, subgraph_name + subgraph_input, + [], + self._graph._ops_registry, + subgraph_name, + subgraph_device ) subgraph.body = subgraph_body for op in subgraph_body: diff --git a/frontend/Python/graph/transform/fuse_ops.py b/frontend/Python/graph/transform/fuse_ops.py index 61f6a5b54a..b3192653ad 100644 --- a/frontend/Python/graph/transform/fuse_ops.py +++ b/frontend/Python/graph/transform/fuse_ops.py @@ -39,7 +39,7 @@ def simply_fuse(graph: Graph): - None: Modifies the input graph in place. 
""" new_op_group = [] - device = DeviceType.UNKNOW + device = DeviceType.GPU for op in graph.body: if isinstance(op, PlaceholderOp): continue diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index fcf6d1f398..9c8a5265e3 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -29,7 +29,8 @@ ReshapeOp, PermuteOp, Conv2dOp, - MaxPool2dOp + MaxPool2dOp, + AddMMOp ) from .utils import * @@ -107,6 +108,8 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): scf.YieldOp([]) gpu.TerminatorOp() + + gpu.HostUnregisterOp(input_cast) output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) memref.CopyOp(input, output) return output @@ -259,10 +262,12 @@ def convolution2d_op(node: Conv2dOp, symbol_table): unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) input_cast = memref.CastOp(unranked_memref_type, input_val) filter_cast = memref.CastOp(unranked_memref_type, filter_val) + bias_cast = memref.CastOp(unranked_memref_type, bias_val) output_cast = memref.CastOp(unranked_memref_type, output_val) gpu.HostRegisterOp(input_cast) gpu.HostRegisterOp(filter_cast) + gpu.HostRegisterOp(bias_cast) gpu.HostRegisterOp(output_cast) # Tile the input_val into Grids @@ -392,6 +397,11 @@ def convolution2d_op(node: Conv2dOp, symbol_table): scf.YieldOp([]) gpu.TerminatorOp() + + gpu.HostUnregisterOp(input_cast) + gpu.HostUnregisterOp(filter_cast) + gpu.HostUnregisterOp(bias_cast) + gpu.HostUnregisterOp(output_cast) return output_val @@ -542,13 +552,178 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): gpu.TerminatorOp() + gpu.HostUnregisterOp(input_cast) + gpu.HostUnregisterOp(output_cast) + return output_val +def addmm_op( + node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation] +): + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) + + # TODO: Reverse the order of the mat2 before multiplication to optimize the cache hit rate + + input_data = symbol_table.get((str(node.args[1]), 0), node.args[1]) + weight = symbol_table.get((str(node.args[2]), 0), node.args[2]) + bias = symbol_table.get((str(node.args[0]), 0), node.args[0]) + # print("input_data: "+str(input_data)) + # print("weight: "+str(weight)) + # print("bias: "+str(bias)) + + # TODO: Transpose of the mat2 before multiplication to optimize the cache hit rate + + output_shape = list(node.tensor_meta["shape"]) + input_shape = input_data.type.shape + weight_shape = weight.type.shape + # print("output_shape: "+str(output_shape)) + # print("output_shape: "+str()) + # print("input_shape: "+str(input_shape)) + # print("weight_shape: "+str(weight_shape)) + # print("bias shape: "+str(bias.type.shape)) + + # Flatten the input into a one-dimensional format + input_size = tensor_shape_size(input_shape) + weight_size = tensor_shape_size(weight_shape) + output_size = tensor_shape_size(output_shape) + # print("input_size: "+str(input_size)) + # print("weight_size: "+str(weight_size)) + # print("output_size: "+str(output_size)) + + input_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_size)) + weight_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 
weight_size)) + output_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) + # print("input_size_c: "+str(input_size_c)) + # print("weight_size_c: "+str(weight_size_c)) + # print("output_size_c: "+str(output_size_c)) + + input_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + weight_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + bias_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + # print("input_shape_1d: "+str(input_shape_1d)) + # print("weight_shape_1d: "+str(weight_shape_1d)) + # print("bias_shape_1d: "+str(bias_shape_1d)) + + memref.StoreOp(input_size_c, input_shape_1d, [c0]) + memref.StoreOp(weight_size_c, weight_shape_1d, [c0]) + memref.StoreOp(output_size_c, bias_shape_1d, [c0]) + + input_reshape_type = ir.MemRefType.get([input_size], element_type) + weight_reshape_type = ir.MemRefType.get([weight_size], element_type) + bias_reshape_type = ir.MemRefType.get([output_size], element_type) + output_type = ir.MemRefType.get(output_shape, element_type) + # print("input_reshape_type: "+str(input_reshape_type)) + # print("weight_reshape_type: "+str(weight_reshape_type)) + # print("bias_reshape_type: "+str(bias_reshape_type)) + # print("output_type: "+str(output_type)) + + input_reshape_1d = memref.ReshapeOp(input_reshape_type, input_data, input_shape_1d) + weight_reshape_1d = memref.ReshapeOp(weight_reshape_type, weight, weight_shape_1d) + bias_reshape_1d = memref.ReshapeOp(bias_reshape_type, bias, bias_shape_1d) + # print("input_reshape: "+str(input_reshape_1d)) + # print("weight_reshape: "+str(weight_reshape_1d)) + # print("bias_reshape: "+str(bias_reshape_1d)) + + + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, input_reshape_1d)) + gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, weight_reshape_1d)) + gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, bias_reshape_1d)) + + row = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[0])) + col = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), weight_shape[1])) + inner_dim = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[1])) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, + blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, + ) + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z + ] + ) + + # TODO: optimize to one dimension + with ir.InsertionPoint(gpu_kernel_block): + tIdX = gpu_kernel_block.arguments[3] + tIdY = gpu_kernel_block.arguments[4] + otter_loop = scf.ForOp( + lower_bound=tIdX, + upper_bound=row, + step=gpu_kernel.blockSizeX + ) + with ir.InsertionPoint(otter_loop.body): + inner_loop = scf.ForOp( + lower_bound=tIdY, + upper_bound=col, + step=gpu_kernel.blockSizeY + ) + with ir.InsertionPoint(inner_loop.body): + initial_sum = 
arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) + + mul_loop = scf.ForOp( + lower_bound=c0.result, + upper_bound=inner_dim, + step=c1.result, + iter_args=[initial_sum] + ) + with ir.InsertionPoint(mul_loop.body): + sum = mul_loop.inner_iter_args[0] + mat1_load = memref.LoadOp(input_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, inner_dim).result, mul_loop.induction_variable)]) + mat2_load = memref.LoadOp(weight_reshape_1d, [arith.AddIOp(arith.MulIOp(mul_loop.induction_variable, col).result, inner_loop.induction_variable)]) + res = arith.MulFOp(mat1_load, mat2_load) + res = arith.AddFOp(sum, res) + scf.YieldOp([res]) + + sum = mul_loop.result + bias_load = memref.LoadOp(bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) + res = arith.AddFOp(sum, bias_load) + memref.StoreOp(res, bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) + scf.YieldOp([]) + scf.YieldOp([]) + gpu.TerminatorOp() + + + output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + + # FIXME: Dialect `memref' not found for custom op 'memref.expand_shape' + # axis = ir.ArrayAttr.get( + # [ + # ir.IntegerAttr.get(ir.IntegerType.get_signless(64), i) + # for i in range(len(output_shape)) + # ], + # None, + # ) + # axis = ir.ArrayAttr.get([axis], None) + # bias_reshape = memref.ExpandShapeOp(output_type, bias, axis) + + bias_shape = memref.AllocOp(ir.MemRefType.get([len(output_shape)], ir.IndexType.get()), [], []) + # print("bias_shape: "+str(bias_shape)) + for i in range(len(output_shape)): + memref.StoreOp(arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_shape[i])), bias_shape, [arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i))]) + + bias_reshape = memref.ReshapeOp(output_type, bias, bias_shape) + memref.CopyOp(bias_reshape, output) + return output + + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, "PermuteOp": permute_op, "Conv2dOp": convolution2d_op, - "MaxPool2dOp": maxpool2d_op + "MaxPool2dOp": maxpool2d_op, + "AddMMOp": addmm_op } diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py index 839ff268da..d9633cf7b4 100644 --- a/frontend/Python/ops/tosa.py +++ b/frontend/Python/ops/tosa.py @@ -1002,15 +1002,17 @@ def convolution2d_op(node: Conv2dOp, symbol_table): result_element_type = mlir_element_type_get(dtype) out_shape = node.tensor_meta["shape"] + # Prepare Depthwise Conv2D information + is_grouped = (list(weight_shape)[1] == 1) and (groups != 1) + is_depthwise = (groups == list(weight_shape)[0]) and is_grouped + # Prepare input channel and output channel. - # TODO: confirm and modify this part. if is_kernel_transposed: in_channels = list(weight_shape)[0] - out_channels = list(weight_shape)[1] + out_channels = list(weight_shape)[1] * groups else: - in_channels = list(weight_shape)[1] + in_channels = list(weight_shape)[1] * groups out_channels = list(weight_shape)[0] - is_depthwise = (groups == in_channels) or (groups == out_channels) # Prepare bias tensor. if len(node._parents) == 2: @@ -1025,20 +1027,19 @@ def convolution2d_op(node: Conv2dOp, symbol_table): else: bias_tensor = symbol_table.get((str(bias), 0)) - # Prepare input padding. 
-    if len(input_padding) == 1:
-        input_padding = [input_padding[0]] * 4
-    elif len(input_padding) == 2:
-        input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2
-
     # Prepare attributes.
-    input_padding_attr = ir._denseI64ArrayAttr(input_padding, None)
     dilation_attr = ir._denseI64ArrayAttr(dilation, None)
     stride_attr = ir._denseI64ArrayAttr(stride, None)
 
-    # TODO: Convolution 1D
     # Convolution 2D
     if len(weight_shape) == 4:
+        # Prepare input padding.
+        if len(input_padding) == 1:
+            input_padding = [input_padding[0]] * 4
+        elif len(input_padding) == 2:
+            input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2
+        # Prepare input_padding attributes.
+        input_padding_attr = ir._denseI64ArrayAttr(input_padding, None)
         # If the input layout is NCHW, then convert to NHWC.
         if node._layout.find("NCHW") != -1:
             perm_list = [0, 2, 3, 1]
@@ -1068,9 +1069,9 @@ def convolution2d_op(node: Conv2dOp, symbol_table):
             out_shape = perm_shape
         output_type = ir.RankedTensorType.get(out_shape, result_element_type)
 
+        # Depthwise Conv2D Operation.
         if is_depthwise is True:
-            # Depthwise Conv2D Operation.
-            # TODO: the layout may lead misunderstanding
+            # If groups == in_channels, then out_channels == in_channels.
             if node._layout.find("FCHW") != -1:
                 perm_list = [2, 3, 0, 1]
                 perm_const_op = tosa.ConstOp(
@@ -1166,9 +1167,88 @@ def convolution2d_op(node: Conv2dOp, symbol_table):
             op = tosa.TransposeOp(
                 permute_result_type, op.result, perm_const_op.results[0]
             )
+    # Convolution 1D
+    elif len(weight_shape) == 3:
+        # Prepare input with padding.
+        if input_padding[0] != 0:
+            input_shape = list(ir.RankedTensorType(input_val.type).shape)
+            padded_type = ir.RankedTensorType.get(
+                [
+                    input_shape[0],
+                    input_shape[1],
+                    input_shape[2] + 2 * input_padding[0],
+                ],
+                result_element_type,
+            )
+            pad_values_type = ir.RankedTensorType.get(
+                [3, 2], ir.IntegerType.get_signless(32)
+            )
+            pad_values = ir.DenseElementsAttr.get(
+                numpy.array(
+                    [[0, 0], [0, 0], [input_padding[0], input_padding[0]]],
+                    dtype=numpy.int32,
+                ),
+                type=pad_values_type,
+            )
+            pad_constant = arith.ConstantOp(pad_values_type, pad_values).result
+            input_val = tosa.PadOp(padded_type, input_val, pad_constant)
+        output_type = ir.RankedTensorType.get(out_shape, result_element_type)
+        output_conv = tensor.EmptyOp(list(out_shape), result_element_type)
+        assert groups == 1, "only support one group"
+        # Conv1D Operation Without Bias
+        conv_op = linalg.conv_1d_ncw_fcw(
+            input_val,
+            weight_val,
+            outs=[output_conv],
+            strides=stride_attr,
+            dilations=dilation_attr,
+        )
+        output = tensor.EmptyOp(list(out_shape), result_element_type)
+        generic_map = ir.AffineMap.get_permutation(
+            [i for i in range(len(list(out_shape)))]
+        )
+        loop_type = [
+            ir.Attribute.parse("#linalg.iterator_type")
+        ] * len(list(out_shape))
+        loop_type[1] = ir.Attribute.parse("#linalg.iterator_type")
+        # Add Bias To Conv1d.
+ op = linalg.GenericOp( + [output_type], + [conv_op, bias_tensor], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(list(out_shape)))] + ) + ), + ir.AffineMapAttr.get(generic_map.get_submap([1])), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(list(out_shape)))] + ) + ), + ] + ), + ir.ArrayAttr.get(loop_type), + ) + block = ir.Block.create_at_start( + op.region, + [ + result_element_type, + ir.RankedTensorType(bias_tensor.type).element_type, + result_element_type, + ], + ) + add_op = arith.AddFOp(block.arguments[1], block.arguments[0]) + block.append(add_op) + block.append(linalg.YieldOp([add_op.result])) + return op + def relu_op(node: ReluOp, symbol_table): """ Import the tensor relu operation. From 9a88cb2e4e2cccfcff10ca989940df0401826316 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Sat, 21 Sep 2024 02:58:35 +0000 Subject: [PATCH 18/29] [frontend] Add the custom subgraph partitioning interface --- examples/BuddyLeNet/CMakeLists.txt | 23 +++- examples/BuddyLeNet/buddy-lenet-import.py | 4 + examples/BuddyLeNet/subgraph1.mlir | 25 +++++ frontend/Python/graph/graph.py | 31 ++++-- frontend/Python/graph/graph_driver.py | 128 +++++++++++++++------- frontend/Python/graph/operation.py | 4 +- frontend/Python/ops/func.py | 4 +- 7 files changed, 167 insertions(+), 52 deletions(-) create mode 100644 examples/BuddyLeNet/subgraph1.mlir diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index c10571f883..f391347466 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -1,7 +1,7 @@ add_custom_command( - OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data + OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py - COMMENT "Generating forward.mlir, subgraph0.mlir and parameter files" + COMMENT "Generating forward.mlir, subgraph1.mlir and parameter files" ) add_custom_command( @@ -33,6 +33,23 @@ add_custom_command( DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll COMMENT "Building subgraph0.o" VERBATIM) + +add_custom_command( + OUTPUT subgraph1.ll + COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMENT "Building subgraph1.ll" + VERBATIM) + +add_custom_command( + OUTPUT subgraph1.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll + COMMENT "Building subgraph1.o" + VERBATIM) # 
add_custom_command( # OUTPUT subgraph0.o @@ -66,7 +83,7 @@ add_custom_command( # COMMENT "Building subgraph0.o" # VERBATIM) -add_library(LENET STATIC subgraph0.o forward.o) +add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 65ef5127f3..4b3160ea4e 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -59,10 +59,14 @@ pattern_list = [simply_fuse] graphs[0].fuse_ops(pattern_list) driver = GraphDriver(graphs[0]) +print(len(driver.subgraphs)) driver.subgraphs[0].lower_to_top_level_ir() +driver.subgraphs[1].lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) +with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: + print(driver.subgraphs[1]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file) diff --git a/examples/BuddyLeNet/subgraph1.mlir b/examples/BuddyLeNet/subgraph1.mlir new file mode 100644 index 0000000000..964dc7bbb8 --- /dev/null +++ b/examples/BuddyLeNet/subgraph1.mlir @@ -0,0 +1,25 @@ +#map = affine_map<(d0, d1) -> (d1, d0)> +module attributes {gpu.container_module} { + func.func @subgraph1(%arg0: tensor<120x256xf32>, %arg1: tensor<84x120xf32>, %arg2: tensor<10x84xf32>) -> (memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32>) { + %0 = bufferization.to_memref %arg0 : memref<120x256xf32> + %1 = bufferization.to_memref %arg1 : memref<84x120xf32> + %2 = bufferization.to_memref %arg2 : memref<10x84xf32> + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %3 = vector.transfer_read %0[%c0, %c0], %cst {permutation_map = #map} : memref<120x256xf32>, vector<256x120xf32> + %alloc = memref.alloc() : memref<256x120xf32> + vector.transfer_write %3, %alloc[%c0, %c0] : vector<256x120xf32>, memref<256x120xf32> + %c0_0 = arith.constant 0 : index + %cst_1 = arith.constant 0.000000e+00 : f32 + %4 = vector.transfer_read %1[%c0_0, %c0_0], %cst_1 {permutation_map = #map} : memref<84x120xf32>, vector<120x84xf32> + %alloc_2 = memref.alloc() : memref<120x84xf32> + vector.transfer_write %4, %alloc_2[%c0_0, %c0_0] : vector<120x84xf32>, memref<120x84xf32> + %c0_3 = arith.constant 0 : index + %cst_4 = arith.constant 0.000000e+00 : f32 + %5 = vector.transfer_read %2[%c0_3, %c0_3], %cst_4 {permutation_map = #map} : memref<10x84xf32>, vector<84x10xf32> + %alloc_5 = memref.alloc() : memref<84x10xf32> + vector.transfer_write %5, %alloc_5[%c0_3, %c0_3] : vector<84x10xf32>, memref<84x10xf32> + return %alloc, %alloc_2, %alloc_5 : memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32> + } +} + diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 3ca8aa298a..b1c8666c38 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -171,13 +171,26 @@ def init_op_group(self): Returns: - None """ + # for i, op in enumerate(self._body): + # if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp): + # continue + # group = [op] + # subgraph_name = "subgraph{}".format(i) + # self.group_map_device[subgraph_name] = DeviceType.GPU + # self.op_groups[subgraph_name] = group + group = [] for i, op in enumerate(self._body): - 
if isinstance(op, PlaceholderOp): + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i==18 or i==21 or i==24: continue - group = [op] - subgraph_name = "subgraph{}".format(i) - self.group_map_device[subgraph_name] = DeviceType.GPU - self.op_groups[subgraph_name] = group + group.append(op) + subgraph_name = "subgraph0" + self.group_map_device[subgraph_name] = DeviceType.GPU + self.op_groups[subgraph_name] = group + + new_group = [self._body[18], self._body[21], self._body[24]] + subgraph_name = "subgraph1" + self.group_map_device[subgraph_name] = DeviceType.GPU + self.op_groups[subgraph_name] = new_group def fuse_ops(self, pattern_list: List[FunctionType]): """ @@ -197,9 +210,9 @@ def fuse_ops(self, pattern_list: List[FunctionType]): # Initialize operation groups self.init_op_group() - # Apply fusion patterns - for pattern_func in pattern_list: - pattern_func(self) + # # Apply fusion patterns + # for pattern_func in pattern_list: + # pattern_func(self) def perform(self, func_list: List[FunctionType]): """ @@ -541,7 +554,7 @@ def generated_func(*args): ] else: self._import_op(node) - + print(self._symbol_table) return self._symbol_table.get(("output", 0)) return self._module diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 62a1239859..a8fbbf1f71 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -21,6 +21,7 @@ # ===--------------------------------------------------------------------------- from mlir import ir +from collections import deque, defaultdict from .graph import Graph, GraphImporter, TensorMeta from .operation import FuncOp, CallOp, PlaceholderOp, OutputOp, GetItemOp @@ -52,6 +53,11 @@ def __init__(self, graph: Graph) -> None: - None """ self._graph = graph + self._subgraph_dependencies = { + subgraph_name : set() + for subgraph_name in list(self._graph.op_groups.keys()) + } + self._call_table = {} ( self._subgraphs, self._subgraphs_inputs, @@ -95,13 +101,14 @@ def build_subgraph_by_group(self): for arg in node.args: output_node.append(arg) - # Identify outputs for each subgraph + # Identify outputs for each subgraph and build dependencies between subgraphs for subgraph_name in self._graph.op_groups.keys(): subgraphs_outputs[subgraph_name] = [] for op in self._graph.op_groups[subgraph_name]: for key in subgraphs_inputs.keys(): if op.name in subgraphs_inputs[key]: subgraphs_outputs[subgraph_name].append(op.name) + self._subgraph_dependencies[subgraph_name].add(key) if (op.name in output_node) and ( op.name not in subgraphs_outputs[subgraph_name] ): @@ -156,6 +163,38 @@ def build_subgraph_by_group(self): return subgraphs, subgraphs_inputs, subgraphs_outputs + def topological_sort_subgraph(self): + """ + Performs topological sorting on the subgraphs based on their dependencies. + + Args: + - graph (Graph): The graph from which subgraphs are constructed. + + Returns: + - list: A list of subgraph names in topological order if the graph is acyclic; otherwise, None. 
+ """ + + # Calculate in degree of each subgraph + in_degree = { subgraph_name : 0 for subgraph_name in list(self._subgraphs.keys()) } + for src, dests in self._subgraph_dependencies.items(): + for dest in dests: + in_degree[dest] += 1 + + # Topological sorting + queue = deque([node for node in in_degree if in_degree[node] == 0]) + topo_order = [] + + while queue: + node = queue.popleft() + topo_order.append(node) + for child in self._subgraph_dependencies[node]: + in_degree[child] -= 1 + if in_degree[child] == 0: + queue.append(child) + + # TODO: If the custom subgraph partitioning is illegal, further partition the subgraph to make it valid. + return topo_order if len(topo_order) == len(list(self._subgraphs.keys())) else None + def construct_main_graph(self, do_param_pack=False): """ Constructs the main computational graph by incorporating subgraphs' call @@ -193,53 +232,68 @@ def construct_main_graph(self, do_param_pack=False): func_node.tensor_meta["dtype"].append( self._graph.node_table[output].tensor_meta["dtype"] ) - main_graph.body.append(func_node) + main_graph.add_node(func_node) # Adding placeholder operations from the original graph for op in self._graph.body: if isinstance(op, PlaceholderOp): - main_graph.body.append(op) + main_graph.add_node(op) + + # Analysis topology order to sort subgraph call. + topo_order = self.topological_sort_subgraph() + if topo_order == None: + print('Error : Graph Partitioning is illegal!') + return None - # TODO: analysis topology order to sort subgraph call. - if len(self._subgraphs) == 1: - # Adding CallOp to invoke the single subgraph + # Adding CallOp to invoke the single subgraph + for i, subgraph_name in enumerate(topo_order): call_node = CallOp() - call_node.name = "call0" - call_node.call_func_name = list(self._subgraphs.keys())[0] + call_node.name = "call{}".format(i) + call_node.call_func_name = subgraph_name call_node.tensor_meta = {"shape": [], "dtype": []} - for inp in list(self._subgraphs_inputs.values())[0]: - call_node.add_argument(inp) - for output in list(self._subgraphs_outputs.values())[0]: + for inp in self._subgraphs_inputs[subgraph_name]: + if inp in main_graph.node_table: + call_node.add_argument(inp) + continue + for key, value in self._subgraphs_outputs.items(): + if inp in value: + call_node.add_argument( + arg=self._call_table[key].name, + arg_index=value.index(inp) + ) + break + for output in self._subgraphs_outputs[subgraph_name]: call_node.tensor_meta["shape"].append( self._graph.node_table[output].tensor_meta["shape"] ) call_node.tensor_meta["dtype"].append( self._graph.node_table[output].tensor_meta["dtype"] ) - main_graph.body.append(call_node) + self._call_table[subgraph_name] = call_node + main_graph.add_node(call_node) - # Adding GetItemOps to retrieve individual output tensors - output_node = OutputOp() - for i, output in enumerate(list(self._subgraphs_outputs.values())[0]): - getitem_node = GetItemOp() - getitem_node.add_argument(call_node.name) - getitem_node.add_argument(i) - getitem_node.name = "getitem{}".format(i) - output_node.add_argument(getitem_node.name) - main_graph.body.append(getitem_node) - - # Marking the final output of the main graph - output_node.name = "output" - main_graph.body.append(output_node) - - # Importing the main graph - with ir.Location.unknown(ir.Context()): - main_importer = GraphImporter( - main_graph.body, - main_graph._fake_params, - main_graph._inputs, - main_graph._func_name, - main_graph._ops_registry, - do_param_pack, - ) - return main_importer.import_main_graph() + # 
Adding GetItemOps to retrieve individual output tensors + output_node = OutputOp() + for i, output in enumerate(self._subgraphs_outputs[topo_order[-1]]): + getitem_node = GetItemOp() + getitem_node.add_argument(call_node.name) + getitem_node.add_argument(i) + getitem_node.name = "getitem{}".format(i) + output_node.add_argument(getitem_node.name) + main_graph.add_node(getitem_node) + + # Marking the final output of the main graph + output_node.name = "output" + main_graph.add_node(output_node) + + # Importing the main graph + with ir.Location.unknown(ir.Context()): + main_importer = GraphImporter( + main_graph.body, + main_graph._fake_params, + main_graph._inputs, + main_graph._func_name, + main_graph._ops_registry, + do_param_pack, + ) + return main_importer.import_main_graph() diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py index 14bfbf2752..c2dc186a39 100644 --- a/frontend/Python/graph/operation.py +++ b/frontend/Python/graph/operation.py @@ -81,13 +81,14 @@ def __init__(self) -> None: """ self._name = None self._arguments = [] + self._args_index = [] self._keyword_arguments = {} self._tensor_meta: Dict = {} self._op_type: OpType = None self._children: List[str] = [] self._parents: List[str] = [] - def add_argument(self, arg): + def add_argument(self, arg, arg_index=0): """ Add an input argument to the operation node. @@ -96,6 +97,7 @@ def add_argument(self, arg): The input argument to be added. """ self._arguments.append(arg) + self._args_index.append(arg_index) def add_parent(self, parent: str): """ diff --git a/frontend/Python/ops/func.py b/frontend/Python/ops/func.py index a7dcc5e11b..e885809d82 100644 --- a/frontend/Python/ops/func.py +++ b/frontend/Python/ops/func.py @@ -59,8 +59,8 @@ def call_op(node: CallOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): From Buddy CallOp to MLIR FUNC call operation. 
""" arguments = [] - for arg in node.args: - input_node = symbol_table.get((str(arg), 0)) + for i, arg in enumerate(node.args): + input_node = symbol_table.get((str(arg), node._args_index[i])) memref_type = ir.MemRefType(input_node.type) stride = [] shape = memref_type.shape From 2f91175702db755addbc0fb013a8e2a23d40901a Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Sun, 22 Sep 2024 10:37:14 +0000 Subject: [PATCH 19/29] [frontend] Fix error in graph partitioning interface --- examples/BuddyLeNet/CMakeLists.txt | 88 +++++++++++------------ examples/BuddyLeNet/buddy-lenet-import.py | 4 +- examples/BuddyLeNet/subgraph1.mlir | 31 +++----- frontend/Python/frontend.py | 4 ++ frontend/Python/graph/graph.py | 30 +++++--- frontend/Python/graph/graph_driver.py | 3 + 6 files changed, 83 insertions(+), 77 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index f391347466..7552f25f55 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -19,7 +19,7 @@ add_custom_command( add_custom_command( OUTPUT subgraph0.ll - COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | + COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | ${LLVM_MLIR_BINARY_DIR}/mlir-opt -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll @@ -34,55 +34,55 @@ add_custom_command( COMMENT "Building subgraph0.o" VERBATIM) -add_custom_command( - OUTPUT subgraph1.ll - COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir - COMMENT "Building subgraph1.ll" - VERBATIM) +# add_custom_command( +# OUTPUT subgraph1.ll +# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | +# ${LLVM_MLIR_BINARY_DIR}/mlir-opt +# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), 
convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | +# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir +# COMMENT "Building subgraph1.ll" +# VERBATIM) +# add_custom_command( +# OUTPUT subgraph1.o +# COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o +# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# COMMENT "Building subgraph1.o" +# VERBATIM) + add_custom_command( OUTPUT subgraph1.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o - DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | + ${BUDDY_BINARY_DIR}/buddy-opt + -eliminate-empty-tensors + -convert-tensor-to-linalg + -linalg-bufferize + -convert-linalg-to-affine-loops + -lower-affine + -func-bufferize-dynamic-offset + -arith-bufferize + -tensor-bufferize + -buffer-deallocation + -finalizing-bufferize + -convert-vector-to-scf + -expand-strided-metadata + -convert-vector-to-llvm + -convert-arith-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -llvm-request-c-wrappers + -convert-arith-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llvm-as | + ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir COMMENT "Building subgraph1.o" VERBATIM) -# add_custom_command( -# OUTPUT subgraph0.o -# COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | -# ${BUDDY_BINARY_DIR}/buddy-opt -# -eliminate-empty-tensors -# -convert-tensor-to-linalg -# -linalg-bufferize -# -convert-linalg-to-affine-loops -# -lower-affine -# -func-bufferize-dynamic-offset -# -arith-bufferize -# -tensor-bufferize -# -buffer-deallocation -# -finalizing-bufferize -# -convert-vector-to-scf -# -expand-strided-metadata -# -convert-vector-to-llvm -# -convert-arith-to-llvm -# -finalize-memref-to-llvm -# -convert-scf-to-cf -# -llvm-request-c-wrappers -# -convert-arith-to-llvm -# -convert-func-to-llvm -# -reconcile-unrealized-casts | -# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | -# ${LLVM_MLIR_BINARY_DIR}/llvm-as | -# ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# COMMENT "Building subgraph0.o" -# VERBATIM) - add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 4b3160ea4e..fa0883c8b9 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -44,7 +44,7 @@ # 
Initialize Dynamo Compiler with specific configurations as an importer. dynamo_compiler = DynamoCompiler( - primary_registry=gpu.ops_registry, + primary_registry=tosa.ops_registry, aot_autograd_decomposition=inductor_decomp, ) @@ -60,6 +60,8 @@ graphs[0].fuse_ops(pattern_list) driver = GraphDriver(graphs[0]) print(len(driver.subgraphs)) +print(driver.subgraphs[0].device) +print(driver.subgraphs[1].device) driver.subgraphs[0].lower_to_top_level_ir() driver.subgraphs[1].lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) diff --git a/examples/BuddyLeNet/subgraph1.mlir b/examples/BuddyLeNet/subgraph1.mlir index 964dc7bbb8..a5d052b8d6 100644 --- a/examples/BuddyLeNet/subgraph1.mlir +++ b/examples/BuddyLeNet/subgraph1.mlir @@ -1,25 +1,12 @@ -#map = affine_map<(d0, d1) -> (d1, d0)> -module attributes {gpu.container_module} { - func.func @subgraph1(%arg0: tensor<120x256xf32>, %arg1: tensor<84x120xf32>, %arg2: tensor<10x84xf32>) -> (memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32>) { - %0 = bufferization.to_memref %arg0 : memref<120x256xf32> - %1 = bufferization.to_memref %arg1 : memref<84x120xf32> - %2 = bufferization.to_memref %arg2 : memref<10x84xf32> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %3 = vector.transfer_read %0[%c0, %c0], %cst {permutation_map = #map} : memref<120x256xf32>, vector<256x120xf32> - %alloc = memref.alloc() : memref<256x120xf32> - vector.transfer_write %3, %alloc[%c0, %c0] : vector<256x120xf32>, memref<256x120xf32> - %c0_0 = arith.constant 0 : index - %cst_1 = arith.constant 0.000000e+00 : f32 - %4 = vector.transfer_read %1[%c0_0, %c0_0], %cst_1 {permutation_map = #map} : memref<84x120xf32>, vector<120x84xf32> - %alloc_2 = memref.alloc() : memref<120x84xf32> - vector.transfer_write %4, %alloc_2[%c0_0, %c0_0] : vector<120x84xf32>, memref<120x84xf32> - %c0_3 = arith.constant 0 : index - %cst_4 = arith.constant 0.000000e+00 : f32 - %5 = vector.transfer_read %2[%c0_3, %c0_3], %cst_4 {permutation_map = #map} : memref<10x84xf32>, vector<84x10xf32> - %alloc_5 = memref.alloc() : memref<84x10xf32> - vector.transfer_write %5, %alloc_5[%c0_3, %c0_3] : vector<84x10xf32>, memref<84x10xf32> - return %alloc, %alloc_2, %alloc_5 : memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32> +module { + func.func @subgraph1(%arg0: tensor<120x256xf32>, %arg1: tensor<84x120xf32>, %arg2: tensor<10x84xf32>) -> (tensor<256x120xf32>, tensor<120x84xf32>, tensor<84x10xf32>) { + %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %1 = tosa.transpose %arg0, %0 : (tensor<120x256xf32>, tensor<2xi32>) -> tensor<256x120xf32> + %2 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %3 = tosa.transpose %arg1, %2 : (tensor<84x120xf32>, tensor<2xi32>) -> tensor<120x84xf32> + %4 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %5 = tosa.transpose %arg2, %4 : (tensor<10x84xf32>, tensor<2xi32>) -> tensor<84x10xf32> + return %1, %3, %5 : tensor<256x120xf32>, tensor<120x84xf32>, tensor<84x10xf32> } } diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index f30eb2a28a..dec046024f 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -42,6 +42,7 @@ from .ops.tosa import ops_registry as tosa_ops_registry from .ops.math import ops_registry as math_ops_registry from .ops.func import ops_registry as func_ops_registry +from .ops.gpu import ops_registry as gpu_ops_registry from .graph import Graph, 
TensorDType, TensorMeta from .graph.operation import * from .graph.transform import maxpool2d_simplify @@ -98,12 +99,14 @@ def __init__( self._verbose = verbose self._imported_graphs = [] self._ops_registry = {} + self._ops_gpu_registry = {} self._imported_params = {} self._ops_registry.update(math_ops_registry) self._ops_registry.update(linalg_ops_registry) self._ops_registry.update(tosa_ops_registry) self._ops_registry.update(func_ops_registry) self._ops_registry.update(primary_registry) + self._ops_gpu_registry.update(gpu_ops_registry) self._ops_map = { "output": OutputOp, "placeholder": PlaceholderOp, @@ -276,6 +279,7 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): func_inputs, fake_params, self._ops_registry, + self._ops_gpu_registry, self._func_name, ) for gm_node in _gm.graph.nodes: diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index b1c8666c38..86867b4d4d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -105,6 +105,7 @@ def __init__( inputs: List[TensorMeta], fake_params: List[TensorMeta], ops_registry: dict, + ops_gpu_registry: dict, func_name: str, device: DeviceType = DeviceType.GPU ) -> None: @@ -128,6 +129,7 @@ def __init__( self.device = device self._imported_module = None self._ops_registry = ops_registry + self._ops_gpu_registry = ops_gpu_registry self._func_name = func_name self._ctx = ir.Context() self._output_memref = None @@ -189,7 +191,7 @@ def init_op_group(self): new_group = [self._body[18], self._body[21], self._body[24]] subgraph_name = "subgraph1" - self.group_map_device[subgraph_name] = DeviceType.GPU + self.group_map_device[subgraph_name] = DeviceType.CPU self.op_groups[subgraph_name] = new_group def fuse_ops(self, pattern_list: List[FunctionType]): @@ -252,6 +254,7 @@ def lower_to_top_level_ir(self): self._inputs, self._func_name, self._ops_registry, + self._ops_gpu_registry, False, self.device ) @@ -261,7 +264,7 @@ def lower_to_top_level_ir(self): output_ranks = [] output_dtypes = [] for out_node in outputs: - out_type = ir.MemRefType(out_node.type) + out_type = ir.RankedTensorType(out_node.type) shape = list(out_type.shape) dtype = out_type.element_type match str(dtype): @@ -363,6 +366,7 @@ def __init__( inputs: List[TensorMeta], func_name: str, ops_registry: dict, + ops_gpu_registry: dict, do_param_pack: bool = False, device: DeviceType = DeviceType.CPU, ): @@ -388,6 +392,7 @@ def __init__( self._num_input_visited = 0 self._module = ir.Module.create() self._ops_registry = ops_registry + self._ops_gpu_registry = ops_gpu_registry self._current_param_pack_offset = None def _str_to_mlir_dtype(self, dtype: str) -> ir.Type: @@ -479,11 +484,11 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] - # if self._device == DeviceType.GPU: - # returns = [ - # buffer.to_tensor(ret) - # for ret in returns - # ] + if self._device == DeviceType.GPU: + returns = [ + buffer.to_tensor(ret) + for ret in returns + ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) @@ -623,9 +628,14 @@ def _import_op(self, node: Op): """ op_name = node.__class__.__name__ - op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - self._ops_registry[op_name](node, self._symbol_table) - ) + if self._device == DeviceType.CPU: + op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + self._ops_registry[op_name](node, self._symbol_table) + ) + else: + op_ret: 
ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + self._ops_gpu_registry[op_name](node, self._symbol_table) + ) if isinstance(op_ret, tuple | List): for i, operation in enumerate(op_ret): if isinstance(operation, ir.Operation) or isinstance( diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index a8fbbf1f71..8ff8966be2 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -153,6 +153,7 @@ def build_subgraph_by_group(self): subgraph_input, [], self._graph._ops_registry, + self._graph._ops_gpu_registry, subgraph_name, subgraph_device ) @@ -215,6 +216,7 @@ def construct_main_graph(self, do_param_pack=False): self._graph._inputs, self._graph._fake_params, self._graph._ops_registry, + self._graph._ops_gpu_registry, self._graph._func_name, ) @@ -294,6 +296,7 @@ def construct_main_graph(self, do_param_pack=False): main_graph._inputs, main_graph._func_name, main_graph._ops_registry, + main_graph._ops_gpu_registry, do_param_pack, ) return main_importer.import_main_graph() From 29745ef8e100ed2003fdf4b3bd6dad61dbf3256c Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Thu, 26 Sep 2024 03:41:12 +0000 Subject: [PATCH 20/29] [frontend] Add JSON format interface for subgraph partitioning implementation --- examples/BuddyLeNet/graph.dot | 56 ---- examples/BuddyLeNet/lenet.json | 1 - examples/BuddyLeNet/subgraph1.mlir | 516 ----------------------------- thirdparty/mimalloc | 1 - 4 files changed, 574 deletions(-) delete mode 100644 examples/BuddyLeNet/graph.dot delete mode 100644 examples/BuddyLeNet/lenet.json delete mode 100644 examples/BuddyLeNet/subgraph1.mlir delete mode 160000 thirdparty/mimalloc diff --git a/examples/BuddyLeNet/graph.dot b/examples/BuddyLeNet/graph.dot deleted file mode 100644 index 04313d9e35..0000000000 --- a/examples/BuddyLeNet/graph.dot +++ /dev/null @@ -1,56 +0,0 @@ -// Buddy Graph -digraph { - arg0_1 -> convolution - arg1_1 -> convolution - arg2_1 -> convolution_1 - arg3_1 -> convolution_1 - arg4_1 -> permute - arg5_1 -> addmm - arg6_1 -> permute_1 - arg7_1 -> addmm_1 - arg8_1 -> permute_2 - arg9_1 -> addmm_2 - arg10_1 -> convolution - convolution -> relu - relu -> max_pool2d - max_pool2d -> convolution_1 - convolution_1 -> relu_1 - relu_1 -> max_pool2d_1 - max_pool2d_1 -> view - view -> addmm - permute -> addmm - addmm -> relu_2 - relu_2 -> addmm_1 - permute_1 -> addmm_1 - addmm_1 -> relu_3 - relu_3 -> addmm_2 - permute_2 -> addmm_2 - addmm_2 -> output - arg0_1 [fillcolor=white shape=ellipse style=filled] - arg1_1 [fillcolor=white shape=ellipse style=filled] - arg2_1 [fillcolor=white shape=ellipse style=filled] - arg3_1 [fillcolor=white shape=ellipse style=filled] - arg4_1 [fillcolor=white shape=ellipse style=filled] - arg5_1 [fillcolor=white shape=ellipse style=filled] - arg6_1 [fillcolor=white shape=ellipse style=filled] - arg7_1 [fillcolor=white shape=ellipse style=filled] - arg8_1 [fillcolor=white shape=ellipse style=filled] - arg9_1 [fillcolor=white shape=ellipse style=filled] - arg10_1 [fillcolor=white shape=ellipse style=filled] - convolution [fillcolor=deepskyblue shape=box style=filled] - relu [fillcolor=deepskyblue shape=box style=filled] - max_pool2d [fillcolor=red shape=box style=filled] - convolution_1 [fillcolor=deepskyblue shape=box style=filled] - relu_1 [fillcolor=deepskyblue shape=box style=filled] - max_pool2d_1 [fillcolor=red shape=box style=filled] - view [fillcolor=deepskyblue shape=box style=filled] - permute [fillcolor=deepskyblue shape=box 
style=filled] - addmm [fillcolor=deepskyblue shape=box style=filled] - relu_2 [fillcolor=deepskyblue shape=box style=filled] - permute_1 [fillcolor=deepskyblue shape=box style=filled] - addmm_1 [fillcolor=deepskyblue shape=box style=filled] - relu_3 [fillcolor=deepskyblue shape=box style=filled] - permute_2 [fillcolor=deepskyblue shape=box style=filled] - addmm_2 [fillcolor=deepskyblue shape=box style=filled] - output [fillcolor=white shape=ellipse style=filled] -} diff --git a/examples/BuddyLeNet/lenet.json b/examples/BuddyLeNet/lenet.json deleted file mode 100644 index aa0ceb90af..0000000000 --- a/examples/BuddyLeNet/lenet.json +++ /dev/null @@ -1 +0,0 @@ -{"graph_name": "forward", "nodes": [{"name": "arg0_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6, 1, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg1_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg2_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16, 6, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg3_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg4_1", "children": ["permute"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 256], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg5_1", "children": ["addmm"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg6_1", "children": ["permute_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg7_1", "children": ["addmm_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg8_1", "children": ["permute_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10, 84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg9_1", "children": ["addmm_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg10_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 1, 28, 28], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "convolution", "children": ["relu"], "parents": ["arg10_1", "arg0_1", "arg1_1"], "arguments": ["arg10_1", "arg0_1", "arg1_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu", "children": ["max_pool2d"], "parents": ["convolution"], "arguments": ["convolution"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, 
"type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d", "children": ["convolution_1"], "parents": ["relu"], "arguments": ["relu", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 12, 12], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "convolution_1", "children": ["relu_1"], "parents": ["max_pool2d", "arg2_1", "arg3_1"], "arguments": ["max_pool2d", "arg2_1", "arg3_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu_1", "children": ["max_pool2d_1"], "parents": ["convolution_1"], "arguments": ["convolution_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d_1", "children": ["view"], "parents": ["relu_1"], "arguments": ["relu_1", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 4, 4], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "view", "children": ["addmm"], "parents": ["max_pool2d_1"], "arguments": ["max_pool2d_1", [-1, 256]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 256], "dtype": "Float32"}, "type": "ReshapeType", "class": "ViewOp"}, {"name": "permute", "children": ["addmm"], "parents": ["arg4_1"], "arguments": ["arg4_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [256, 120], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm", "children": ["relu_2"], "parents": ["arg5_1", "view", "permute"], "arguments": ["arg5_1", "view", "permute"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_2", "children": ["addmm_1"], "parents": ["addmm"], "arguments": ["addmm"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_1", "children": ["addmm_1"], "parents": ["arg6_1"], "arguments": ["arg6_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 84], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_1", "children": ["relu_3"], "parents": ["arg7_1", "relu_2", "permute_1"], "arguments": ["arg7_1", "relu_2", "permute_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_3", "children": ["addmm_2"], "parents": ["addmm_1"], "arguments": ["addmm_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_2", "children": ["addmm_2"], "parents": ["arg8_1"], "arguments": ["arg8_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 10], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_2", "children": ["output"], "parents": ["arg9_1", "relu_3", "permute_2"], "arguments": ["arg9_1", "relu_3", "permute_2"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 10], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "output", "children": [], "parents": [], "arguments": ["addmm_2"], "keyword_arguments": {}, "tensor_meta": {}, "type": "GetItemType", "class": "OutputOp"}], "device": "cpu", "params": [{"shape": [6, 1, 5, 5], "dtype": "Float32"}, {"shape": [6], "dtype": "Float32"}, {"shape": [16, 6, 5, 5], "dtype": 
"Float32"}, {"shape": [16], "dtype": "Float32"}, {"shape": [120, 256], "dtype": "Float32"}, {"shape": [120], "dtype": "Float32"}, {"shape": [84, 120], "dtype": "Float32"}, {"shape": [84], "dtype": "Float32"}, {"shape": [10, 84], "dtype": "Float32"}, {"shape": [10], "dtype": "Float32"}], "inputs": [{"shape": [1, 1, 28, 28], "dtype": "Float32"}], "node_map_device": {"convolution": "gpu", "relu": "gpu", "max_pool2d": "gpu", "convolution_1": "gpu", "relu_1": "gpu", "max_pool2d_1": "gpu", "view": "gpu", "addmm": "gpu", "relu_2": "gpu", "addmm_1": "gpu", "relu_3": "gpu", "addmm_2": "gpu", "permute": "cpu", "permute_1": "cpu", "permute_2": "cpu"}} \ No newline at end of file diff --git a/examples/BuddyLeNet/subgraph1.mlir b/examples/BuddyLeNet/subgraph1.mlir deleted file mode 100644 index 918a5569cf..0000000000 --- a/examples/BuddyLeNet/subgraph1.mlir +++ /dev/null @@ -1,516 +0,0 @@ -module attributes {gpu.container_module} { - func.func @subgraph1(%arg0: tensor<1x1x28x28xf32>, %arg1: tensor<6x1x5x5xf32>, %arg2: tensor<6xf32>, %arg3: tensor<16x6x5x5xf32>, %arg4: tensor<16xf32>, %arg5: tensor<120xf32>, %arg6: tensor<256x120xf32>, %arg7: tensor<84xf32>, %arg8: tensor<120x84xf32>, %arg9: tensor<10xf32>, %arg10: tensor<84x10xf32>) -> tensor<1x10xf32> { - %0 = bufferization.to_memref %arg0 : memref<1x1x28x28xf32> - %1 = bufferization.to_memref %arg1 : memref<6x1x5x5xf32> - %2 = bufferization.to_memref %arg2 : memref<6xf32> - %3 = bufferization.to_memref %arg3 : memref<16x6x5x5xf32> - %4 = bufferization.to_memref %arg4 : memref<16xf32> - %5 = bufferization.to_memref %arg5 : memref<120xf32> - %6 = bufferization.to_memref %arg6 : memref<256x120xf32> - %7 = bufferization.to_memref %arg7 : memref<84xf32> - %8 = bufferization.to_memref %arg8 : memref<120x84xf32> - %9 = bufferization.to_memref %arg9 : memref<10xf32> - %10 = bufferization.to_memref %arg10 : memref<84x10xf32> - %alloc = memref.alloc() : memref<1x6x24x24xf32> - %cast = memref.cast %0 : memref<1x1x28x28xf32> to memref<*xf32> - %cast_0 = memref.cast %1 : memref<6x1x5x5xf32> to memref<*xf32> - %cast_1 = memref.cast %2 : memref<6xf32> to memref<*xf32> - %cast_2 = memref.cast %alloc : memref<1x6x24x24xf32> to memref<*xf32> - gpu.host_register %cast : memref<*xf32> - gpu.host_register %cast_0 : memref<*xf32> - gpu.host_register %cast_1 : memref<*xf32> - gpu.host_register %cast_2 : memref<*xf32> - %c1 = arith.constant 1 : index - %c1_3 = arith.constant 1 : index - %c6 = arith.constant 6 : index - %c4 = arith.constant 4 : index - %c16 = arith.constant 16 : index - %c5 = arith.constant 5 : index - %c5_4 = arith.constant 5 : index - %c0 = arith.constant 0 : index - %c1_5 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1, %arg18 = %c6, %arg19 = %c4) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16, %arg21 = %c16, %arg22 = %c1_5) { - %c2 = arith.constant 2 : index - %12 = arith.divui %arg13, %c2 : index - %13 = arith.muli %12, %c16 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c2 : index - %16 = arith.muli %15, %c16 : index - %17 = arith.addi %16, %arg15 : index - %c1_140 = arith.constant 1 : index - %c1_141 = arith.constant 1 : index - %18 = arith.muli %14, %c1_140 : index - %19 = arith.muli %17, %c1_141 : index - %c24 = arith.constant 24 : index - %c24_142 = arith.constant 24 : index - %20 = arith.cmpi ult, %14, %c24 : index - %21 = arith.cmpi ult, %17, %c24_142 : index - %22 = arith.andi %20, %21 : i1 - %cst = arith.constant 0.000000e+00 : f32 - scf.if %22 { - %23 = scf.for %arg23 = 
%c0 to %c1_3 step %c1_5 iter_args(%arg24 = %cst) -> (f32) { - %26 = scf.for %arg25 = %c0 to %c5 step %c1_5 iter_args(%arg26 = %cst) -> (f32) { - %28 = scf.for %arg27 = %c0 to %c5_4 step %c1_5 iter_args(%arg28 = %cst) -> (f32) { - %30 = arith.addi %18, %arg25 : index - %31 = arith.addi %19, %arg27 : index - %32 = memref.load %0[%arg11, %arg23, %30, %31] : memref<1x1x28x28xf32> - %33 = memref.load %1[%arg12, %arg23, %arg25, %arg27] : memref<6x1x5x5xf32> - %34 = arith.mulf %32, %33 : f32 - %35 = arith.addf %arg28, %34 : f32 - scf.yield %35 : f32 - } - %29 = arith.addf %28, %arg26 : f32 - scf.yield %29 : f32 - } - %27 = arith.addf %26, %arg24 : f32 - scf.yield %27 : f32 - } - %24 = memref.load %2[%arg12] : memref<6xf32> - %25 = arith.addf %23, %24 : f32 - memref.store %25, %alloc[%arg11, %arg12, %14, %17] : memref<1x6x24x24xf32> - } - gpu.terminator - } - gpu.host_unregister %cast : memref<*xf32> - gpu.host_unregister %cast_0 : memref<*xf32> - gpu.host_unregister %cast_1 : memref<*xf32> - gpu.host_unregister %cast_2 : memref<*xf32> - %c0_6 = arith.constant 0 : index - %c1_7 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c3456 = arith.constant 3456 : index - %alloc_8 = memref.alloc() : memref<1xindex> - memref.store %c3456, %alloc_8[%c0_6] : memref<1xindex> - %reshape = memref.reshape %alloc(%alloc_8) : (memref<1x6x24x24xf32>, memref<1xindex>) -> memref<3456xf32> - %cast_9 = memref.cast %alloc : memref<1x6x24x24xf32> to memref<*xf32> - gpu.host_register %cast_9 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_7, %arg18 = %c1_7, %arg19 = %c1_7) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512, %arg21 = %c1_7, %arg22 = %c1_7) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c3456 step %c512 { - %12 = memref.load %reshape[%arg23] : memref<3456xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape[%arg23] : memref<3456xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_9 : memref<*xf32> - %alloc_10 = memref.alloc() : memref<1x6x24x24xf32> - memref.copy %alloc, %alloc_10 : memref<1x6x24x24xf32> to memref<1x6x24x24xf32> - %alloc_11 = memref.alloc() : memref<1x6x12x12xf32> - %cast_12 = memref.cast %alloc_10 : memref<1x6x24x24xf32> to memref<*xf32> - %cast_13 = memref.cast %alloc_11 : memref<1x6x12x12xf32> to memref<*xf32> - gpu.host_register %cast_12 : memref<*xf32> - gpu.host_register %cast_13 : memref<*xf32> - %c1_14 = arith.constant 1 : index - %c6_15 = arith.constant 6 : index - %c1_16 = arith.constant 1 : index - %c16_17 = arith.constant 16 : index - %c0_18 = arith.constant 0 : index - %c1_19 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_14, %arg18 = %c6_15, %arg19 = %c1_16) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16_17, %arg21 = %c16_17, %arg22 = %c1_19) { - %c1_140 = arith.constant 1 : index - %12 = arith.divui %arg13, %c1_140 : index - %13 = arith.muli %12, %c16_17 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c1_140 : index - %16 = arith.muli %15, %c16_17 : index - %17 = arith.addi %16, %arg15 : index - %c2 = arith.constant 2 : index - %c2_141 = arith.constant 2 : index - %c2_142 = arith.constant 2 : index - %c2_143 = arith.constant 2 : index - %18 = arith.muli %14, %c2_142 : index - %19 = arith.muli %17, %c2_143 : index - %c12 = arith.constant 12 : index - %c12_144 = arith.constant 12 : index - %20 = arith.cmpi ult, %14, %c12 : index - %21 = arith.cmpi ult, %17, %c12_144 : index - %22 = arith.andi %20, %21 : i1 - 
scf.if %22 { - %23 = memref.load %alloc_10[%arg11, %arg12, %18, %19] : memref<1x6x24x24xf32> - %24 = scf.for %arg23 = %c0_18 to %c2 step %c1_19 iter_args(%arg24 = %23) -> (f32) { - %25 = scf.for %arg25 = %c0_18 to %c2_141 step %c1_19 iter_args(%arg26 = %23) -> (f32) { - %27 = arith.addi %18, %arg23 : index - %28 = arith.addi %19, %arg25 : index - %29 = memref.load %alloc_10[%arg11, %arg12, %27, %28] : memref<1x6x24x24xf32> - %30 = arith.maxnumf %arg26, %29 : f32 - scf.yield %30 : f32 - } - %26 = arith.maxnumf %25, %arg24 : f32 - scf.yield %26 : f32 - } - memref.store %24, %alloc_11[%arg11, %arg12, %14, %17] : memref<1x6x12x12xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_12 : memref<*xf32> - gpu.host_unregister %cast_13 : memref<*xf32> - %alloc_20 = memref.alloc() : memref<1x16x8x8xf32> - %cast_21 = memref.cast %alloc_11 : memref<1x6x12x12xf32> to memref<*xf32> - %cast_22 = memref.cast %3 : memref<16x6x5x5xf32> to memref<*xf32> - %cast_23 = memref.cast %4 : memref<16xf32> to memref<*xf32> - %cast_24 = memref.cast %alloc_20 : memref<1x16x8x8xf32> to memref<*xf32> - gpu.host_register %cast_21 : memref<*xf32> - gpu.host_register %cast_22 : memref<*xf32> - gpu.host_register %cast_23 : memref<*xf32> - gpu.host_register %cast_24 : memref<*xf32> - %c1_25 = arith.constant 1 : index - %c6_26 = arith.constant 6 : index - %c16_27 = arith.constant 16 : index - %c1_28 = arith.constant 1 : index - %c16_29 = arith.constant 16 : index - %c5_30 = arith.constant 5 : index - %c5_31 = arith.constant 5 : index - %c0_32 = arith.constant 0 : index - %c1_33 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_25, %arg18 = %c16_27, %arg19 = %c1_28) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16_29, %arg21 = %c16_29, %arg22 = %c1_33) { - %c1_140 = arith.constant 1 : index - %12 = arith.divui %arg13, %c1_140 : index - %13 = arith.muli %12, %c16_29 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c1_140 : index - %16 = arith.muli %15, %c16_29 : index - %17 = arith.addi %16, %arg15 : index - %c1_141 = arith.constant 1 : index - %c1_142 = arith.constant 1 : index - %18 = arith.muli %14, %c1_141 : index - %19 = arith.muli %17, %c1_142 : index - %c8 = arith.constant 8 : index - %c8_143 = arith.constant 8 : index - %20 = arith.cmpi ult, %14, %c8 : index - %21 = arith.cmpi ult, %17, %c8_143 : index - %22 = arith.andi %20, %21 : i1 - %cst = arith.constant 0.000000e+00 : f32 - scf.if %22 { - %23 = scf.for %arg23 = %c0_32 to %c6_26 step %c1_33 iter_args(%arg24 = %cst) -> (f32) { - %26 = scf.for %arg25 = %c0_32 to %c5_30 step %c1_33 iter_args(%arg26 = %cst) -> (f32) { - %28 = scf.for %arg27 = %c0_32 to %c5_31 step %c1_33 iter_args(%arg28 = %cst) -> (f32) { - %30 = arith.addi %18, %arg25 : index - %31 = arith.addi %19, %arg27 : index - %32 = memref.load %alloc_11[%arg11, %arg23, %30, %31] : memref<1x6x12x12xf32> - %33 = memref.load %3[%arg12, %arg23, %arg25, %arg27] : memref<16x6x5x5xf32> - %34 = arith.mulf %32, %33 : f32 - %35 = arith.addf %arg28, %34 : f32 - scf.yield %35 : f32 - } - %29 = arith.addf %28, %arg26 : f32 - scf.yield %29 : f32 - } - %27 = arith.addf %26, %arg24 : f32 - scf.yield %27 : f32 - } - %24 = memref.load %4[%arg12] : memref<16xf32> - %25 = arith.addf %23, %24 : f32 - memref.store %25, %alloc_20[%arg11, %arg12, %14, %17] : memref<1x16x8x8xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_21 : memref<*xf32> - gpu.host_unregister %cast_22 : memref<*xf32> - gpu.host_unregister %cast_23 : memref<*xf32> - gpu.host_unregister 
%cast_24 : memref<*xf32> - %c0_34 = arith.constant 0 : index - %c1_35 = arith.constant 1 : index - %c512_36 = arith.constant 512 : index - %c1024 = arith.constant 1024 : index - %alloc_37 = memref.alloc() : memref<1xindex> - memref.store %c1024, %alloc_37[%c0_34] : memref<1xindex> - %reshape_38 = memref.reshape %alloc_20(%alloc_37) : (memref<1x16x8x8xf32>, memref<1xindex>) -> memref<1024xf32> - %cast_39 = memref.cast %alloc_20 : memref<1x16x8x8xf32> to memref<*xf32> - gpu.host_register %cast_39 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_35, %arg18 = %c1_35, %arg19 = %c1_35) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_36, %arg21 = %c1_35, %arg22 = %c1_35) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c1024 step %c512_36 { - %12 = memref.load %reshape_38[%arg23] : memref<1024xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape_38[%arg23] : memref<1024xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_39 : memref<*xf32> - %alloc_40 = memref.alloc() : memref<1x16x8x8xf32> - memref.copy %alloc_20, %alloc_40 : memref<1x16x8x8xf32> to memref<1x16x8x8xf32> - %alloc_41 = memref.alloc() : memref<1x16x4x4xf32> - %cast_42 = memref.cast %alloc_40 : memref<1x16x8x8xf32> to memref<*xf32> - %cast_43 = memref.cast %alloc_41 : memref<1x16x4x4xf32> to memref<*xf32> - gpu.host_register %cast_42 : memref<*xf32> - gpu.host_register %cast_43 : memref<*xf32> - %c1_44 = arith.constant 1 : index - %c16_45 = arith.constant 16 : index - %c1_46 = arith.constant 1 : index - %c16_47 = arith.constant 16 : index - %c0_48 = arith.constant 0 : index - %c1_49 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_44, %arg18 = %c16_45, %arg19 = %c1_46) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16_47, %arg21 = %c16_47, %arg22 = %c1_49) { - %c1_140 = arith.constant 1 : index - %12 = arith.divui %arg13, %c1_140 : index - %13 = arith.muli %12, %c16_47 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c1_140 : index - %16 = arith.muli %15, %c16_47 : index - %17 = arith.addi %16, %arg15 : index - %c2 = arith.constant 2 : index - %c2_141 = arith.constant 2 : index - %c2_142 = arith.constant 2 : index - %c2_143 = arith.constant 2 : index - %18 = arith.muli %14, %c2_142 : index - %19 = arith.muli %17, %c2_143 : index - %c4_144 = arith.constant 4 : index - %c4_145 = arith.constant 4 : index - %20 = arith.cmpi ult, %14, %c4_144 : index - %21 = arith.cmpi ult, %17, %c4_145 : index - %22 = arith.andi %20, %21 : i1 - scf.if %22 { - %23 = memref.load %alloc_40[%arg11, %arg12, %18, %19] : memref<1x16x8x8xf32> - %24 = scf.for %arg23 = %c0_48 to %c2 step %c1_49 iter_args(%arg24 = %23) -> (f32) { - %25 = scf.for %arg25 = %c0_48 to %c2_141 step %c1_49 iter_args(%arg26 = %23) -> (f32) { - %27 = arith.addi %18, %arg23 : index - %28 = arith.addi %19, %arg25 : index - %29 = memref.load %alloc_40[%arg11, %arg12, %27, %28] : memref<1x16x8x8xf32> - %30 = arith.maxnumf %arg26, %29 : f32 - scf.yield %30 : f32 - } - %26 = arith.maxnumf %25, %arg24 : f32 - scf.yield %26 : f32 - } - memref.store %24, %alloc_41[%arg11, %arg12, %14, %17] : memref<1x16x4x4xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_42 : memref<*xf32> - gpu.host_unregister %cast_43 : memref<*xf32> - %alloc_50 = memref.alloc() : memref<2xindex> - %c0_51 = arith.constant 0 : index - %c1_52 = arith.constant 1 : index - memref.store %c1_52, %alloc_50[%c0_51] : memref<2xindex> - %c1_53 = arith.constant 1 : index - %c256 = 
arith.constant 256 : index - memref.store %c256, %alloc_50[%c1_53] : memref<2xindex> - %reshape_54 = memref.reshape %alloc_41(%alloc_50) : (memref<1x16x4x4xf32>, memref<2xindex>) -> memref<1x256xf32> - %c0_55 = arith.constant 0 : index - %c1_56 = arith.constant 1 : index - %c512_57 = arith.constant 512 : index - %c256_58 = arith.constant 256 : index - %c30720 = arith.constant 30720 : index - %c120 = arith.constant 120 : index - %alloc_59 = memref.alloc() : memref<1xindex> - %alloc_60 = memref.alloc() : memref<1xindex> - %alloc_61 = memref.alloc() : memref<1xindex> - memref.store %c256_58, %alloc_59[%c0_55] : memref<1xindex> - memref.store %c30720, %alloc_60[%c0_55] : memref<1xindex> - memref.store %c120, %alloc_61[%c0_55] : memref<1xindex> - %reshape_62 = memref.reshape %reshape_54(%alloc_59) : (memref<1x256xf32>, memref<1xindex>) -> memref<256xf32> - %reshape_63 = memref.reshape %6(%alloc_60) : (memref<256x120xf32>, memref<1xindex>) -> memref<30720xf32> - %reshape_64 = memref.reshape %5(%alloc_61) : (memref<120xf32>, memref<1xindex>) -> memref<120xf32> - %cast_65 = memref.cast %reshape_62 : memref<256xf32> to memref<*xf32> - gpu.host_register %cast_65 : memref<*xf32> - %cast_66 = memref.cast %reshape_63 : memref<30720xf32> to memref<*xf32> - gpu.host_register %cast_66 : memref<*xf32> - %cast_67 = memref.cast %reshape_64 : memref<120xf32> to memref<*xf32> - gpu.host_register %cast_67 : memref<*xf32> - %c1_68 = arith.constant 1 : index - %c120_69 = arith.constant 120 : index - %c256_70 = arith.constant 256 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_56, %arg18 = %c1_56, %arg19 = %c1_56) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_57, %arg21 = %c1_56, %arg22 = %c1_56) { - scf.for %arg23 = %arg14 to %c1_68 step %c512_57 { - scf.for %arg24 = %arg15 to %c120_69 step %c1_56 { - %cst = arith.constant 0.000000e+00 : f32 - %12 = scf.for %arg25 = %c0_55 to %c256_70 step %c1_56 iter_args(%arg26 = %cst) -> (f32) { - %19 = arith.muli %arg23, %c256_70 : index - %20 = arith.addi %19, %arg25 : index - %21 = memref.load %reshape_62[%20] : memref<256xf32> - %22 = arith.muli %arg25, %c120_69 : index - %23 = arith.addi %22, %arg24 : index - %24 = memref.load %reshape_63[%23] : memref<30720xf32> - %25 = arith.mulf %21, %24 : f32 - %26 = arith.addf %arg26, %25 : f32 - scf.yield %26 : f32 - } - %13 = arith.muli %arg23, %c120_69 : index - %14 = arith.addi %13, %arg24 : index - %15 = memref.load %reshape_64[%14] : memref<120xf32> - %16 = arith.addf %12, %15 : f32 - %17 = arith.muli %arg23, %c120_69 : index - %18 = arith.addi %17, %arg24 : index - memref.store %16, %reshape_64[%18] : memref<120xf32> - } - } - gpu.terminator - } - %alloc_71 = memref.alloc() : memref<1x120xf32> - %alloc_72 = memref.alloc() : memref<2xindex> - %c1_73 = arith.constant 1 : index - %c0_74 = arith.constant 0 : index - memref.store %c1_73, %alloc_72[%c0_74] : memref<2xindex> - %c120_75 = arith.constant 120 : index - %c1_76 = arith.constant 1 : index - memref.store %c120_75, %alloc_72[%c1_76] : memref<2xindex> - %reshape_77 = memref.reshape %5(%alloc_72) : (memref<120xf32>, memref<2xindex>) -> memref<1x120xf32> - memref.copy %reshape_77, %alloc_71 : memref<1x120xf32> to memref<1x120xf32> - %c0_78 = arith.constant 0 : index - %c1_79 = arith.constant 1 : index - %c512_80 = arith.constant 512 : index - %c120_81 = arith.constant 120 : index - %alloc_82 = memref.alloc() : memref<1xindex> - memref.store %c120_81, %alloc_82[%c0_78] : memref<1xindex> - %reshape_83 = memref.reshape %alloc_71(%alloc_82) : 
(memref<1x120xf32>, memref<1xindex>) -> memref<120xf32> - %cast_84 = memref.cast %alloc_71 : memref<1x120xf32> to memref<*xf32> - gpu.host_register %cast_84 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_79, %arg18 = %c1_79, %arg19 = %c1_79) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_80, %arg21 = %c1_79, %arg22 = %c1_79) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c120_81 step %c512_80 { - %12 = memref.load %reshape_83[%arg23] : memref<120xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape_83[%arg23] : memref<120xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_84 : memref<*xf32> - %alloc_85 = memref.alloc() : memref<1x120xf32> - memref.copy %alloc_71, %alloc_85 : memref<1x120xf32> to memref<1x120xf32> - %c0_86 = arith.constant 0 : index - %c1_87 = arith.constant 1 : index - %c512_88 = arith.constant 512 : index - %c120_89 = arith.constant 120 : index - %c10080 = arith.constant 10080 : index - %c84 = arith.constant 84 : index - %alloc_90 = memref.alloc() : memref<1xindex> - %alloc_91 = memref.alloc() : memref<1xindex> - %alloc_92 = memref.alloc() : memref<1xindex> - memref.store %c120_89, %alloc_90[%c0_86] : memref<1xindex> - memref.store %c10080, %alloc_91[%c0_86] : memref<1xindex> - memref.store %c84, %alloc_92[%c0_86] : memref<1xindex> - %reshape_93 = memref.reshape %alloc_85(%alloc_90) : (memref<1x120xf32>, memref<1xindex>) -> memref<120xf32> - %reshape_94 = memref.reshape %8(%alloc_91) : (memref<120x84xf32>, memref<1xindex>) -> memref<10080xf32> - %reshape_95 = memref.reshape %7(%alloc_92) : (memref<84xf32>, memref<1xindex>) -> memref<84xf32> - %cast_96 = memref.cast %reshape_93 : memref<120xf32> to memref<*xf32> - gpu.host_register %cast_96 : memref<*xf32> - %cast_97 = memref.cast %reshape_94 : memref<10080xf32> to memref<*xf32> - gpu.host_register %cast_97 : memref<*xf32> - %cast_98 = memref.cast %reshape_95 : memref<84xf32> to memref<*xf32> - gpu.host_register %cast_98 : memref<*xf32> - %c1_99 = arith.constant 1 : index - %c84_100 = arith.constant 84 : index - %c120_101 = arith.constant 120 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_87, %arg18 = %c1_87, %arg19 = %c1_87) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_88, %arg21 = %c1_87, %arg22 = %c1_87) { - scf.for %arg23 = %arg14 to %c1_99 step %c512_88 { - scf.for %arg24 = %arg15 to %c84_100 step %c1_87 { - %cst = arith.constant 0.000000e+00 : f32 - %12 = scf.for %arg25 = %c0_86 to %c120_101 step %c1_87 iter_args(%arg26 = %cst) -> (f32) { - %19 = arith.muli %arg23, %c120_101 : index - %20 = arith.addi %19, %arg25 : index - %21 = memref.load %reshape_93[%20] : memref<120xf32> - %22 = arith.muli %arg25, %c84_100 : index - %23 = arith.addi %22, %arg24 : index - %24 = memref.load %reshape_94[%23] : memref<10080xf32> - %25 = arith.mulf %21, %24 : f32 - %26 = arith.addf %arg26, %25 : f32 - scf.yield %26 : f32 - } - %13 = arith.muli %arg23, %c84_100 : index - %14 = arith.addi %13, %arg24 : index - %15 = memref.load %reshape_95[%14] : memref<84xf32> - %16 = arith.addf %12, %15 : f32 - %17 = arith.muli %arg23, %c84_100 : index - %18 = arith.addi %17, %arg24 : index - memref.store %16, %reshape_95[%18] : memref<84xf32> - } - } - gpu.terminator - } - %alloc_102 = memref.alloc() : memref<1x84xf32> - %alloc_103 = memref.alloc() : memref<2xindex> - %c1_104 = arith.constant 1 : index - %c0_105 = arith.constant 0 : index - memref.store %c1_104, %alloc_103[%c0_105] : memref<2xindex> - %c84_106 = arith.constant 84 : 
index - %c1_107 = arith.constant 1 : index - memref.store %c84_106, %alloc_103[%c1_107] : memref<2xindex> - %reshape_108 = memref.reshape %7(%alloc_103) : (memref<84xf32>, memref<2xindex>) -> memref<1x84xf32> - memref.copy %reshape_108, %alloc_102 : memref<1x84xf32> to memref<1x84xf32> - %c0_109 = arith.constant 0 : index - %c1_110 = arith.constant 1 : index - %c512_111 = arith.constant 512 : index - %c84_112 = arith.constant 84 : index - %alloc_113 = memref.alloc() : memref<1xindex> - memref.store %c84_112, %alloc_113[%c0_109] : memref<1xindex> - %reshape_114 = memref.reshape %alloc_102(%alloc_113) : (memref<1x84xf32>, memref<1xindex>) -> memref<84xf32> - %cast_115 = memref.cast %alloc_102 : memref<1x84xf32> to memref<*xf32> - gpu.host_register %cast_115 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_110, %arg18 = %c1_110, %arg19 = %c1_110) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_111, %arg21 = %c1_110, %arg22 = %c1_110) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c84_112 step %c512_111 { - %12 = memref.load %reshape_114[%arg23] : memref<84xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape_114[%arg23] : memref<84xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_115 : memref<*xf32> - %alloc_116 = memref.alloc() : memref<1x84xf32> - memref.copy %alloc_102, %alloc_116 : memref<1x84xf32> to memref<1x84xf32> - %c0_117 = arith.constant 0 : index - %c1_118 = arith.constant 1 : index - %c512_119 = arith.constant 512 : index - %c84_120 = arith.constant 84 : index - %c840 = arith.constant 840 : index - %c10 = arith.constant 10 : index - %alloc_121 = memref.alloc() : memref<1xindex> - %alloc_122 = memref.alloc() : memref<1xindex> - %alloc_123 = memref.alloc() : memref<1xindex> - memref.store %c84_120, %alloc_121[%c0_117] : memref<1xindex> - memref.store %c840, %alloc_122[%c0_117] : memref<1xindex> - memref.store %c10, %alloc_123[%c0_117] : memref<1xindex> - %reshape_124 = memref.reshape %alloc_116(%alloc_121) : (memref<1x84xf32>, memref<1xindex>) -> memref<84xf32> - %reshape_125 = memref.reshape %10(%alloc_122) : (memref<84x10xf32>, memref<1xindex>) -> memref<840xf32> - %reshape_126 = memref.reshape %9(%alloc_123) : (memref<10xf32>, memref<1xindex>) -> memref<10xf32> - %cast_127 = memref.cast %reshape_124 : memref<84xf32> to memref<*xf32> - gpu.host_register %cast_127 : memref<*xf32> - %cast_128 = memref.cast %reshape_125 : memref<840xf32> to memref<*xf32> - gpu.host_register %cast_128 : memref<*xf32> - %cast_129 = memref.cast %reshape_126 : memref<10xf32> to memref<*xf32> - gpu.host_register %cast_129 : memref<*xf32> - %c1_130 = arith.constant 1 : index - %c10_131 = arith.constant 10 : index - %c84_132 = arith.constant 84 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_118, %arg18 = %c1_118, %arg19 = %c1_118) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_119, %arg21 = %c1_118, %arg22 = %c1_118) { - scf.for %arg23 = %arg14 to %c1_130 step %c512_119 { - scf.for %arg24 = %arg15 to %c10_131 step %c1_118 { - %cst = arith.constant 0.000000e+00 : f32 - %12 = scf.for %arg25 = %c0_117 to %c84_132 step %c1_118 iter_args(%arg26 = %cst) -> (f32) { - %19 = arith.muli %arg23, %c84_132 : index - %20 = arith.addi %19, %arg25 : index - %21 = memref.load %reshape_124[%20] : memref<84xf32> - %22 = arith.muli %arg25, %c10_131 : index - %23 = arith.addi %22, %arg24 : index - %24 = memref.load %reshape_125[%23] : memref<840xf32> - %25 = arith.mulf %21, %24 : f32 - %26 = arith.addf %arg26, %25 : 
f32 - scf.yield %26 : f32 - } - %13 = arith.muli %arg23, %c10_131 : index - %14 = arith.addi %13, %arg24 : index - %15 = memref.load %reshape_126[%14] : memref<10xf32> - %16 = arith.addf %12, %15 : f32 - %17 = arith.muli %arg23, %c10_131 : index - %18 = arith.addi %17, %arg24 : index - memref.store %16, %reshape_126[%18] : memref<10xf32> - } - } - gpu.terminator - } - %alloc_133 = memref.alloc() : memref<1x10xf32> - %alloc_134 = memref.alloc() : memref<2xindex> - %c1_135 = arith.constant 1 : index - %c0_136 = arith.constant 0 : index - memref.store %c1_135, %alloc_134[%c0_136] : memref<2xindex> - %c10_137 = arith.constant 10 : index - %c1_138 = arith.constant 1 : index - memref.store %c10_137, %alloc_134[%c1_138] : memref<2xindex> - %reshape_139 = memref.reshape %9(%alloc_134) : (memref<10xf32>, memref<2xindex>) -> memref<1x10xf32> - memref.copy %reshape_139, %alloc_133 : memref<1x10xf32> to memref<1x10xf32> - %11 = bufferization.to_tensor %alloc_133 : memref<1x10xf32> - return %11 : tensor<1x10xf32> - } -} - diff --git a/thirdparty/mimalloc b/thirdparty/mimalloc deleted file mode 160000 index 81a771161e..0000000000 --- a/thirdparty/mimalloc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 81a771161e37c8559c442fff099115cd1977db1e From 75445899dcf80fb394205c2ae65684924a116e47 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Sun, 29 Sep 2024 13:46:05 +0000 Subject: [PATCH 21/29] standby --- examples/BuddyLeNet/buddy-lenet-import.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 95e76de253..cae6924b18 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -63,6 +63,8 @@ path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) +with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: + print(driver.subgraphs[0]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file) From 82b92f8507a7c22ed4921f790da2b1bfdfeb13db Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 15 Oct 2024 07:59:07 +0000 Subject: [PATCH 22/29] The GPU OP-enabled version --- examples/BuddyLeNet/CMakeLists.txt | 78 +++++- examples/BuddyLeNet/buddy-lenet-import.py | 14 +- examples/BuddyLeNet/makefile | 17 ++ examples/BuddyLeNet/matmul-cubin.mlir | 3 + examples/BuddyLeNet/transform.mlir | 311 ++++++++++++++++++++++ frontend/Python/graph/graph.py | 12 +- thirdparty/mimalloc | 1 + 7 files changed, 408 insertions(+), 28 deletions(-) create mode 100644 examples/BuddyLeNet/matmul-cubin.mlir create mode 100644 examples/BuddyLeNet/transform.mlir create mode 160000 thirdparty/mimalloc diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 89c93a17d3..337c4342bd 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -1,5 +1,5 @@ add_custom_command( - OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data + OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data COMMAND python3 
${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py COMMENT "Generating forward.mlir, subgraph1.mlir and parameter files" ) @@ -17,43 +17,93 @@ add_custom_command( COMMENT "Building forward.o" VERBATIM) +# add_custom_command( +# OUTPUT subgraph0.ll +# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# --transform-preload-library -transform-library-paths="transform.mlir" +# --transform-interpreter -entry-point="codegen" | +# ${BUDDY_BINARY_DIR}/buddy-opt --pass-pipeline "builtin.module(func.func(nvgpu-optimize-shared-memory))" | +# ${BUDDY_BINARY_DIR}/buddy-opt +# -arith-expand +# -eliminate-empty-tensors +# -empty-tensor-to-alloc-tensor +# -linalg-bufferize +# -convert-linalg-to-affine-loops +# -affine-loop-fusion +# -affine-parallelize +# -lower-affine +# -canonicalize +# -func-bufferize +# -arith-bufferize +# -tensor-bufferize +# -buffer-deallocation +# -finalizing-bufferize +# -canonicalize +# -gpu-launch-sink-index-computations +# -canonicalize +# -legalize-shmem-outlining +# -canonicalize +# -convert-memcpy-to-gpu +# -gpu-async-region +# -canonicalize +# -convert-scf-to-cf +# -memref-expand +# -finalize-memref-to-llvm +# -convert-arith-to-llvm +# -convert-vector-to-llvm +# -convert-gpu-to-nvvm +# -has-redux=1 +# -llvm-request-c-wrappers +# -canonicalize +# -cse +# -sccp | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt +# --test-lower-to-nvvm -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# COMMENT "Building subgraph0.ll" +# VERBATIM +# ) + add_custom_command( OUTPUT subgraph0.ll COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir COMMENT "Building subgraph0.ll" VERBATIM) add_custom_command( OUTPUT subgraph0.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll COMMENT "Building subgraph0.o" VERBATIM) + + + # add_custom_command( # OUTPUT subgraph1.ll # COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | -# ${LLVM_MLIR_BINARY_DIR}/mlir-opt +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt # -pass-pipeline 
"builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | -# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll # DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir # COMMENT "Building subgraph1.ll" # VERBATIM) # add_custom_command( # OUTPUT subgraph1.o -# COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o +# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o # DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll # COMMENT "Building subgraph1.o" # VERBATIM) add_custom_command( OUTPUT subgraph1.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | ${BUDDY_BINARY_DIR}/buddy-opt -eliminate-empty-tensors @@ -77,11 +127,11 @@ add_custom_command( -convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llvm-as | - ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir - COMMENT "Building subgraph0.o" + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_TOOLS_BINARY_DIR}/llvm-as | + ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMENT "Building subgraph1.o" VERBATIM) add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) @@ -93,5 +143,5 @@ target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR}) set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES}) -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${OpenCV_LIBS}) +set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES}) target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS}) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 903a8b095b..c878b3b163 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -75,21 +75,19 @@ with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) -with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: - print(driver.subgraphs[0]._imported_module, file=module_file) with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: print(driver.subgraphs[1]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file) -# params = dynamo_compiler.imported_params[graph] -# current_path = 
os.path.dirname(os.path.abspath(__file__)) +params = dynamo_compiler.imported_params[graph] +current_path = os.path.dirname(os.path.abspath(__file__)) -# float32_param = np.concatenate( -# [param.detach().numpy().reshape([-1]) for param in params] -# ) +float32_param = np.concatenate( + [param.detach().numpy().reshape([-1]) for param in params] +) -# float32_param.tofile(Path(current_path) / "arg0.data") +float32_param.tofile(Path(current_path) / "arg0.data") # # Convert the lenet graph to JSON string # json_str = graph.to_json() diff --git a/examples/BuddyLeNet/makefile b/examples/BuddyLeNet/makefile index fe87b6da1a..f29fcf0769 100644 --- a/examples/BuddyLeNet/makefile +++ b/examples/BuddyLeNet/makefile @@ -20,6 +20,22 @@ MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.dylib MTRIPLE := x86_64-apple-darwin endif +buddy-gpu-matmul-lower: + @${BUDDY_OPT} subgraph0.mlir \ + -transform-preload-library="transform-library-paths=transform.mlir" \ + -transform-interpreter="entry-point=codegen" \ + -o log.mlir + +buddy-gpu-matmul: + @${BUDDY_OPT} subgraph0.mlir -transform-preload-library="transform-library-paths=transform.mlir" -transform-interpreter="entry-point=codegen" | \ + ${BUDDY_OPT} --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | \ + ${BUDDY_OPT} -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize | \ + ${BUDDY_OPT} -gpu-launch-sink-index-computations -canonicalize -legalize-shmem-outlining -canonicalize | \ + ${BUDDY_OPT} -convert-memcpy-to-gpu -gpu-async-region -canonicalize | \ + ${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \ + ${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \ + ${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o matmul-cubin.mlir + buddy-lenet-lower: @${BUDDY_OPT} ./fake-lenet.mlir \ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \ @@ -124,3 +140,4 @@ buddy-lenet-opt-run: -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + diff --git a/examples/BuddyLeNet/matmul-cubin.mlir b/examples/BuddyLeNet/matmul-cubin.mlir new file mode 100644 index 0000000000..0a1e515093 --- /dev/null +++ b/examples/BuddyLeNet/matmul-cubin.mlir @@ -0,0 +1,3 @@ +module { +} + diff --git a/examples/BuddyLeNet/transform.mlir b/examples/BuddyLeNet/transform.mlir new file mode 100644 index 0000000000..e2a02a9a97 --- /dev/null +++ b/examples/BuddyLeNet/transform.mlir @@ -0,0 +1,311 @@ +module attributes { transform.with_named_sequence } { + transform.named_sequence @codegen(%arg0: !transform.any_op) { + // Match the target operations and assign them to SSA values. + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 + : (!transform.any_op) -> !transform.any_op + %fill = transform.structured.match ops{["linalg.fill"]} in %arg0 + : (!transform.any_op) -> !transform.any_op + + // Perform tiling for the grid. 
+ // For the matrix multiplication of 5376x2048 and 2048x5376, the compilation + // strategy sets the tile size for grid-based partitioning to 128x256. + // This means that each [128, 2048] @ [2048, 256] matmul tile is computed within a GPU block, + // while multiple such blocks are computed in parallel across the grid. + // `tile_sizes` specify the dimensions of the tiled matmul result. + // `%tiled_op` is the tiled matmul operation within the `scf.forall` loop. + // `%forall_op` is the `scf.forall` loop that maintains tile information. + %tiled_op, %forall_op = transform.structured.tile_using_forall %matmul + tile_sizes [128, 256] (mapping = [#gpu.block, #gpu.block]) + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Perform canonicalization. + %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %1 : !transform.any_op + %all_loops = transform.structured.match interface{LoopLikeInterface} + in %arg0 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops : !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.linalg.tiling_canonicalization + } : !transform.any_op + + // Fuse the fill operation into the scf.forall op. + %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Further tile the tiled matmul. + // Tile the third dimension in matmul. + // [128, 2048] @ [2048, 256] matmul is further tiled into [128, 16] @ [16, 256] matmul. + %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Create pad op and prepare for mapping to GPU. + // Nothing has changed in the operation. + %padded, %pad, %copy = transform.structured.pad %tiled_linalg_op {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + // Rewrite tensor.pad into linalg.copy. + %3 = transform.get_producer_of_operand %padded[0] : (!transform.any_op) -> !transform.any_op + %4 = transform.get_producer_of_operand %padded[1] : (!transform.any_op) -> !transform.any_op + %5 = transform.get_producer_of_operand %padded[2] : (!transform.any_op) -> !transform.any_op + %6 = transform.structured.rewrite_in_destination_passing_style %3 : (!transform.any_op) -> !transform.any_op + %7 = transform.structured.rewrite_in_destination_passing_style %4 : (!transform.any_op) -> !transform.any_op + %8 = transform.structured.rewrite_in_destination_passing_style %5 : (!transform.any_op) -> !transform.any_op + + // Tile the linalg.copy op and map it to GPU thread level, + // such that the tiled matrices are copied to GPU shared memory. + // num_threads is different from tile_sizes used above, + // as it specifies the number of tiles instead of the size of each tile. + // The first transform tiles the [128, 16] into [4, 4], + // and the second transform tiles the [16, 256] into [2, 16].
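The tile arithmetic quoted in the comments above can be checked mechanically. The following Python sketch is an illustration only (not part of the patch); the matmul shape, block tile, K step, and num_threads values are taken from the comments in this transform script, and the helper function is hypothetical.

# Sanity check of the tiling arithmetic described above (illustration only).
# Assumed values, all quoted in this script: a 5376x2048 @ 2048x5376 matmul,
# a 128x256 block tile, a K step of 16, and copy num_threads of [32, 4] / [8, 16].
def per_thread_tile(operand_shape, num_threads):
    # Each thread handles operand_shape / num_threads elements per dimension.
    return [dim // threads for dim, threads in zip(operand_shape, num_threads)]

M, K, N = 5376, 2048, 5376
block_tile = (128, 256)   # tile_sizes for the grid-level scf.forall
k_step = 16               # tile_using_for step on the reduction dimension

assert per_thread_tile([block_tile[0], k_step], [32, 4]) == [4, 4]    # A-operand copy
assert per_thread_tile([k_step, block_tile[1]], [8, 16]) == [2, 16]   # B-operand copy
print("K steps per block tile:", K // k_step)                         # 128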
+ %tiled_op_0, %forall_op_1 = transform.structured.tile_using_forall %6 num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %tiled_op_2, %forall_op_3 = transform.structured.tile_using_forall %7 num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile the linalg.matmul op and map it to GPU warp level. + %tiled_op_4, %forall_op_5 = transform.structured.tile_using_forall %padded num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // Tile the linalg.fill op and map it to GPU warp level. + %tiled_op_6, %forall_op_7 = transform.structured.tile_using_forall %fused_op num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Perform canonicalization. + %9 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %9 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %9 : !transform.any_op + %all_loops_2 = transform.structured.match interface{LoopLikeInterface} + in %9 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_2 : !transform.any_op + transform.apply_patterns to %9 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Perform vectorization. + // Vectorize the linalg.copy, linalg.fill, and linalg.matmul operations. + %10 = transform.structured.vectorize_children_and_apply_patterns %9 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %10 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %10 : !transform.any_op + %all_loops_3 = transform.structured.match interface{LoopLikeInterface} + in %10 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_3 : !transform.any_op + transform.apply_patterns to %10 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Match bufferization.alloc_tensors inside the forall op + %scf_forall = transform.structured.match ops{["scf.forall"]} attributes{mapping = [#gpu.block, #gpu.block]} in %arg0 : (!transform.any_op) -> !transform.any_op + %alloc_tensor_ops = transform.structured.match ops{["bufferization.alloc_tensor"]} in %scf_forall : (!transform.any_op) -> !transform.any_op + + // Bufferize the alloc_tensor ops to memref.alloc ops. + // The memory_space attribute for GPU Dialect 0 means global memory, 3 means workgroup memory address, 5 means private memory address. + // According to https://discourse.llvm.org/t/rfc-memref-memory-shape-as-attribute/2229 + %buffer, %new_ops = transform.structured.bufferize_to_allocation %alloc_tensor_ops {memory_space = 3 } : !transform.any_op + + // Eliminate empty tensors and erase unnecessary inputs. 
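To make the warp-level split above concrete, here is a small illustrative Python sketch (not part of the patch). It only restates the arithmetic implied by num_threads [2, 2] on the 128x256 block tile; the helper function is hypothetical.

# Illustration only: dividing the 128x256 block tile across the 2x2 warp grid
# requested by `num_threads [2, 2]` above.
def warp_tiles(block_tile, warp_grid):
    rows, cols = block_tile
    warps_m, warps_n = warp_grid
    return (rows // warps_m, cols // warps_n), warps_m * warps_n

per_warp_tile, warps_per_block = warp_tiles((128, 256), (2, 2))
print(per_warp_tile, warps_per_block)  # (64, 128) per warp, 4 warps per block
# With a warp size of 32, those 4 warps match the 64 * 2 * 1 = 128 threads that
# the later `map_nested_forall_to_threads block_dims = [64, 2, 1]` step uses.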
+ transform.structured.eliminate_empty_tensors %arg0 : !transform.any_op + %func_eras = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func_eras { + transform.apply_patterns.linalg.erase_unnecessary_inputs + } : !transform.any_op + + // Bufferize the remaining operations in one time. + %11 = transform.bufferization.one_shot_bufferize %arg0 { bufferize_function_boundaries = true, function_boundary_type_conversion = 1 : i32} : (!transform.any_op) -> !transform.any_op + + // Erase dead alloc and stores. + %12 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + transform.memref.erase_dead_alloc_and_stores %12 : (!transform.any_op) -> () + + // Generate GPU launch. + %13 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + %gpu_launch = transform.gpu.map_forall_to_blocks %13 { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op + + // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute. + %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op + + %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + + // Removes unnecessary GPU barriers from the function. + // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %15 : !transform.any_op + %all_loops_4 = transform.structured.match interface{LoopLikeInterface} + in %15 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_4 : !transform.any_op + transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Identify static memory allocations within the given region, + // and move them to a higher level (hoisting). + transform.buddy.hoist_static_alloc %15 : (!transform.any_op) -> () + + // Collects patterns for folding memref aliasing ops (memref.subview) into consumer load/store ops (affine.load, memref.load, nvgpu.ldmatrix, vector.load, vector.transfer_read, affine.store, memref.store, etc.) and other ops (e.g., memref.subview). + transform.apply_patterns to %15 { + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + // Collects patterns for extracting address computations from operations with memory accesses such that these memory accesses use only a base pointer. + transform.apply_patterns to %15 { + transform.apply_patterns.memref.extract_address_computations + } : !transform.any_op + // Perform canonicalization. 
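As a rough illustration of the launch shape implied by the block/thread mapping above (not part of the patch; it assumes the 5376x5376 result and the 128x256 block tile quoted at the top of this script):

# Back-of-the-envelope launch configuration (illustration only).
import math

result_shape = (5376, 5376)   # assumed from the matmul quoted in the comments
block_tile = (128, 256)       # grid-level tile_sizes
block_dims = (64, 2, 1)       # from map_nested_forall_to_threads above

grid = tuple(math.ceil(dim / tile) for dim, tile in zip(result_shape, block_tile))
threads_per_block = block_dims[0] * block_dims[1] * block_dims[2]
print(grid, threads_per_block)  # (42, 21) blocks, 128 threads (4 warps) each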
+ transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %15 : !transform.any_op + %all_loops_5 = transform.structured.match interface{LoopLikeInterface} + in %15 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_5 : !transform.any_op + transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Adds patterns that unroll vectors to a native tile size for GPUs with mma operations + transform.apply_patterns to %15 { + transform.apply_patterns.buddy.unroll_vectors_gpu_mma_sync + } : !transform.any_op + + // Insert a gpu.barrier after a given scf.for loop + %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for"> + // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () + + + transform.apply_patterns to %15 { + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + transform.apply_cse to %15 : !transform.any_op + + // Hoist vector.transfer_read / vector.transfer_write pairs out of immediately enclosing scf::ForOp iteratively + // Warning: Deprecated + %17 = transform.structured.hoist_redundant_vector_transfers %15 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + %all_loops_6 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_6 : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // This converts slices of operations containing vector.contract op into + // mma operations, targetting warp level tensorcore operations. + transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> () + + // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + %all_loops_7 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_7 : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op + %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc"> + + // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations. 
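The multi-buffering step that follows uses factor = 3; a minimal Python sketch of the idea (illustration only, not part of the patch; the staged "tiles" are placeholders) is:

# Multi-buffering sketch: rotate through `factor` copies of the shared-memory
# staging buffer so that iteration i can fill one slot while earlier
# iterations' slots are still being read.
FACTOR = 3
buffers = [None] * FACTOR

def stage(i, tile):
    buffers[i % FACTOR] = tile      # each iteration writes its own slot

def consume(i):
    return buffers[i % FACTOR]      # and reads back the slot it wrote

for i in range(6):
    stage(i, f"k-step {i}")
    # compute on consume(i) can now overlap the staging of later k-steps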
+ transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op + + transform.apply_patterns to %17 { + transform.apply_patterns.vector.transfer_to_scf full_unroll = true + } : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + %all_loops_8 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_8 : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Convert sync copies to shared memory to async. + // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> () + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + %all_loops_9 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_9 : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + + + %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op + %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op + // This applies software pipelining to a given scf.for loop. + // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop. + // %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. 
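For the (currently commented-out) pipeline_shared_memory_copies step above, a rough sketch of the schedule it aims for, assuming depth = 3 as written, might look like this (illustration only, not part of the patch):

# Software-pipelining sketch (illustration only): shared-memory copies run
# `depth` k-steps ahead of the compute that consumes them.
def pipelined_schedule(num_steps, depth=3):
    events = [f"prologue: copy k-step {i}" for i in range(min(depth, num_steps))]
    for i in range(num_steps):
        if i + depth < num_steps:
            events.append(f"copy k-step {i + depth} overlapped with compute k-step {i}")
        else:
            events.append(f"epilogue: compute k-step {i}")
    return events

for event in pipelined_schedule(6):
    print(event)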
+ transform.apply_patterns to %17 { + transform.apply_patterns.vector.lower_masks + } : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.vector.materialize_masks + } : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + + %all_loops_10 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_10 : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + + transform.yield + } +} // module diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 2f98949d7d..6a18f8b80d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -186,16 +186,16 @@ def init_op_group(self): # self.op_groups[subgraph_name] = group group = [] for i, op in enumerate(self._body): - if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i==18 or i==21 or i==24: + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i == 25: continue group.append(op) - subgraph_name = "subgraph0" - self.group_map_device[subgraph_name] = DeviceType.GPU - self.op_groups[subgraph_name] = group - - new_group = [self._body[18], self._body[21], self._body[24]] subgraph_name = "subgraph1" self.group_map_device[subgraph_name] = DeviceType.CPU + self.op_groups[subgraph_name] = group + + new_group = [self._body[25]] + subgraph_name = "subgraph0" + self.group_map_device[subgraph_name] = DeviceType.GPU self.op_groups[subgraph_name] = new_group def fuse_ops(self, pattern_list: List[FunctionType]): diff --git a/thirdparty/mimalloc b/thirdparty/mimalloc new file mode 160000 index 0000000000..81a771161e --- /dev/null +++ b/thirdparty/mimalloc @@ -0,0 +1 @@ +Subproject commit 81a771161e37c8559c442fff099115cd1977db1e From a8569cce3babf02de103eb43b85644fbaae3ceaf Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 29 Oct 2024 04:34:48 +0000 Subject: [PATCH 23/29] Separate for heterogeneous demo --- examples/BuddyLeNet/CMakeLists.txt | 146 ++-- examples/BuddyLeNet/matmul-cubin.mlir | 3 - examples/BuddyLeNet/transform.mlir | 311 -------- examples/BuddyTest/.gitignore | 3 - examples/BuddyTest/CMakeLists.txt | 29 - examples/BuddyTest/README.md | 65 -- examples/BuddyTest/import-test.py | 55 -- examples/BuddyTest/makefile | 56 -- examples/BuddyTest/model.py | 37 - examples/BuddyTest/test-main.cpp | 115 --- examples/CMakeLists.txt | 4 - frontend/Python/frontend.py | 4 - frontend/Python/graph/graph.py | 53 +- frontend/Python/graph/graph_driver.py | 3 - frontend/Python/graph/json_decoder.py | 2 - frontend/Python/ops/gpu.py | 729 ------------------ .../Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp | 133 +--- tests/Conversion/convert-memcpy-to-gpu.mlir | 229 +++++- 18 files changed, 333 insertions(+), 1644 deletions(-) delete mode 100644 examples/BuddyLeNet/matmul-cubin.mlir delete mode 100644 examples/BuddyLeNet/transform.mlir delete mode 100644 examples/BuddyTest/.gitignore delete mode 100644 examples/BuddyTest/CMakeLists.txt delete mode 100644 examples/BuddyTest/README.md delete mode 100644 examples/BuddyTest/import-test.py delete mode 100644 examples/BuddyTest/makefile delete mode 100644 examples/BuddyTest/model.py delete mode 100644 examples/BuddyTest/test-main.cpp delete mode 100644 
frontend/Python/ops/gpu.py diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 337c4342bd..6e9cfe1204 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -1,7 +1,7 @@ add_custom_command( OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py - COMMENT "Generating forward.mlir, subgraph1.mlir and parameter files" + COMMENT "Generating forward.mlir, subgraph0.mlir, subgraph1.mlir and parameter files" ) add_custom_command( @@ -17,93 +17,9 @@ add_custom_command( COMMENT "Building forward.o" VERBATIM) -# add_custom_command( -# OUTPUT subgraph0.ll -# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# --transform-preload-library -transform-library-paths="transform.mlir" -# --transform-interpreter -entry-point="codegen" | -# ${BUDDY_BINARY_DIR}/buddy-opt --pass-pipeline "builtin.module(func.func(nvgpu-optimize-shared-memory))" | -# ${BUDDY_BINARY_DIR}/buddy-opt -# -arith-expand -# -eliminate-empty-tensors -# -empty-tensor-to-alloc-tensor -# -linalg-bufferize -# -convert-linalg-to-affine-loops -# -affine-loop-fusion -# -affine-parallelize -# -lower-affine -# -canonicalize -# -func-bufferize -# -arith-bufferize -# -tensor-bufferize -# -buffer-deallocation -# -finalizing-bufferize -# -canonicalize -# -gpu-launch-sink-index-computations -# -canonicalize -# -legalize-shmem-outlining -# -canonicalize -# -convert-memcpy-to-gpu -# -gpu-async-region -# -canonicalize -# -convert-scf-to-cf -# -memref-expand -# -finalize-memref-to-llvm -# -convert-arith-to-llvm -# -convert-vector-to-llvm -# -convert-gpu-to-nvvm -# -has-redux=1 -# -llvm-request-c-wrappers -# -canonicalize -# -cse -# -sccp | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# --test-lower-to-nvvm -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# COMMENT "Building subgraph0.ll" -# VERBATIM -# ) - -add_custom_command( - OUTPUT subgraph0.ll - COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | - ${LLVM_TOOLS_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir - COMMENT "Building subgraph0.ll" - VERBATIM) - add_custom_command( OUTPUT subgraph0.o - COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o - DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll - COMMENT "Building subgraph0.o" - VERBATIM) - - - - -# add_custom_command( -# OUTPUT subgraph1.ll -# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers 
-convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -# COMMENT "Building subgraph1.ll" -# VERBATIM) - -# add_custom_command( -# OUTPUT subgraph1.o -# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o -# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# COMMENT "Building subgraph1.o" -# VERBATIM) - -add_custom_command( - OUTPUT subgraph1.o - COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | ${BUDDY_BINARY_DIR}/buddy-opt -eliminate-empty-tensors @@ -129,11 +45,65 @@ add_custom_command( -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | ${LLVM_TOOLS_BINARY_DIR}/llvm-as | + ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir + COMMENT "Building subgraph0.o" + VERBATIM) + +# new +set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") +set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") +add_custom_command( + OUTPUT subgraph1.o + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt + -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} + -buffer-deallocation + -convert-linalg-to-parallel-loops + -canonicalize + -gpu-map-parallel-loops + -convert-parallel-loops-to-gpu + -gpu-kernel-outlining + -canonicalize + -cse | + ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_TOOLS_BINARY_DIR}/llvm-as | ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir COMMENT "Building subgraph1.o" VERBATIM) +# add_library(LENET_GPU STATIC subgraph0_gpu.o forward.o) + +# SET_TARGET_PROPERTIES(LENET_GPU PROPERTIES LINKER_LANGUAGE C) + +# add_executable(buddy-lenet-run-gpu buddy-lenet-main.cpp) +# target_link_directories(buddy-lenet-run-gpu PRIVATE ${LLVM_LIBRARY_DIR}) + +# set(BUDDY_LENET_LIBS_GPU LENET_GPU mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime ${PNG_LIBRARIES}) + +# target_link_libraries(buddy-lenet-run-gpu ${BUDDY_LENET_LIBS_GPU}) + +# 
add_custom_command( +# OUTPUT subgraph1.ll +# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt +# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir +# COMMENT "Building subgraph1.ll" +# VERBATIM) + +# add_custom_command( +# OUTPUT subgraph1.o +# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o +# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# COMMENT "Building subgraph1.o" +# VERBATIM) + add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) @@ -141,7 +111,5 @@ SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) add_executable(buddy-lenet-run buddy-lenet-main.cpp) target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR}) -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES}) - -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES}) +set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES}) target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS}) diff --git a/examples/BuddyLeNet/matmul-cubin.mlir b/examples/BuddyLeNet/matmul-cubin.mlir deleted file mode 100644 index 0a1e515093..0000000000 --- a/examples/BuddyLeNet/matmul-cubin.mlir +++ /dev/null @@ -1,3 +0,0 @@ -module { -} - diff --git a/examples/BuddyLeNet/transform.mlir b/examples/BuddyLeNet/transform.mlir deleted file mode 100644 index e2a02a9a97..0000000000 --- a/examples/BuddyLeNet/transform.mlir +++ /dev/null @@ -1,311 +0,0 @@ -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen(%arg0: !transform.any_op) { - // Match the target operations and assign them to SSA values. - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 - : (!transform.any_op) -> !transform.any_op - %fill = transform.structured.match ops{["linalg.fill"]} in %arg0 - : (!transform.any_op) -> !transform.any_op - - // Perform tiling for the grid. - // For the matrix multiplication of 5376x2048 and 2048x5376, the compilation - // strategy sets the tile size for grid-based partitioning to 128x256. - // This means that each [128, 2048] @ [2048, 256] matmul tile is computed within a GPU block, - // while multiple such blocks are computed in parallel across the grid. - // `tile_sizes` specify the dimensions of the tiled matmul result. - // `%tiled_op` is the tiled matmul operation within the `scf.forall` loop. - // `%forall_op` is the `scf.forall` loop that maintains tile information. 
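-    // Concretely, for the 5376x5376 result described above this yields 5376/128 = 42 tiles
-    // along the rows and 5376/256 = 21 tiles along the columns, i.e. a 42x21 grid
-    // of blocks, each computing one [128, 2048] @ [2048, 256] partial product.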
-    %tiled_op, %forall_op = transform.structured.tile_using_forall %matmul
-      tile_sizes [128, 256] (mapping = [#gpu.block, #gpu.block])
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Perform canonicalization.
-    %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %1 {
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %1 : !transform.any_op
-    %all_loops = transform.structured.match interface{LoopLikeInterface}
-        in %arg0
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_licm to %all_loops : !transform.any_op
-    transform.apply_patterns to %1 {
-      transform.apply_patterns.linalg.tiling_canonicalization
-    } : !transform.any_op
-
-    // Fuse the fill operation into the scf.forall op.
-    %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Further tile the tiled matmul:
-    // tile the third (reduction) dimension of the matmul.
-    // The [128, 2048] @ [2048, 256] matmul is further tiled into [128, 16] @ [16, 256] matmuls.
-    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Create pad op and prepare for mapping to GPU.
-    // Nothing has changed in the operation.
-    %padded, %pad, %copy = transform.structured.pad %tiled_linalg_op {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Rewrite tensor.pad into linalg.copy.
-    %3 = transform.get_producer_of_operand %padded[0] : (!transform.any_op) -> !transform.any_op
-    %4 = transform.get_producer_of_operand %padded[1] : (!transform.any_op) -> !transform.any_op
-    %5 = transform.get_producer_of_operand %padded[2] : (!transform.any_op) -> !transform.any_op
-    %6 = transform.structured.rewrite_in_destination_passing_style %3 : (!transform.any_op) -> !transform.any_op
-    %7 = transform.structured.rewrite_in_destination_passing_style %4 : (!transform.any_op) -> !transform.any_op
-    %8 = transform.structured.rewrite_in_destination_passing_style %5 : (!transform.any_op) -> !transform.any_op
-
-    // Tile the linalg.copy ops and map them to the GPU thread level,
-    // so that the tiled matrices are copied to GPU shared memory.
-    // num_threads is different from the tile_sizes used above:
-    // it specifies the number of tiles instead of the size of each tile.
-    // The first transform tiles the [128, 16] copy into [4, 4] pieces,
-    // and the second tiles the [16, 256] copy into [2, 16] pieces.
-    %tiled_op_0, %forall_op_1 = transform.structured.tile_using_forall %6 num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %tiled_op_2, %forall_op_3 = transform.structured.tile_using_forall %7 num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Tile the linalg.matmul op and map it to the GPU warp level.
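-    // With num_threads [2, 2], the [128, 256] block tile is split across a 2x2
-    // arrangement of warps, so each warp owns a [64, 128] piece of the result.
-    // This is consistent with the block_dims = [64, 2, 1] / warp_size = 32 mapping
-    // applied later, which likewise gives 64 * 2 / 32 = 4 warps per block.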
- %tiled_op_4, %forall_op_5 = transform.structured.tile_using_forall %padded num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Tile the linalg.fill op and map it to GPU warp level. - %tiled_op_6, %forall_op_7 = transform.structured.tile_using_forall %fused_op num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Perform canonicalization. - %9 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %9 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %9 : !transform.any_op - %all_loops_2 = transform.structured.match interface{LoopLikeInterface} - in %9 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_2 : !transform.any_op - transform.apply_patterns to %9 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Perform vectorization. - // Vectorize the linalg.copy, linalg.fill, and linalg.matmul operations. - %10 = transform.structured.vectorize_children_and_apply_patterns %9 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %10 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %10 : !transform.any_op - %all_loops_3 = transform.structured.match interface{LoopLikeInterface} - in %10 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_3 : !transform.any_op - transform.apply_patterns to %10 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Match bufferization.alloc_tensors inside the forall op - %scf_forall = transform.structured.match ops{["scf.forall"]} attributes{mapping = [#gpu.block, #gpu.block]} in %arg0 : (!transform.any_op) -> !transform.any_op - %alloc_tensor_ops = transform.structured.match ops{["bufferization.alloc_tensor"]} in %scf_forall : (!transform.any_op) -> !transform.any_op - - // Bufferize the alloc_tensor ops to memref.alloc ops. - // The memory_space attribute for GPU Dialect 0 means global memory, 3 means workgroup memory address, 5 means private memory address. - // According to https://discourse.llvm.org/t/rfc-memref-memory-shape-as-attribute/2229 - %buffer, %new_ops = transform.structured.bufferize_to_allocation %alloc_tensor_ops {memory_space = 3 } : !transform.any_op - - // Eliminate empty tensors and erase unnecessary inputs. - transform.structured.eliminate_empty_tensors %arg0 : !transform.any_op - %func_eras = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_eras { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - - // Bufferize the remaining operations in one time. - %11 = transform.bufferization.one_shot_bufferize %arg0 { bufferize_function_boundaries = true, function_boundary_type_conversion = 1 : i32} : (!transform.any_op) -> !transform.any_op - - // Erase dead alloc and stores. 
- %12 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op - transform.memref.erase_dead_alloc_and_stores %12 : (!transform.any_op) -> () - - // Generate GPU launch. - %13 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op - %gpu_launch = transform.gpu.map_forall_to_blocks %13 { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op - - // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute. - %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op - - %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op - - // Removes unnecessary GPU barriers from the function. - // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %15 : !transform.any_op - %all_loops_4 = transform.structured.match interface{LoopLikeInterface} - in %15 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_4 : !transform.any_op - transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Identify static memory allocations within the given region, - // and move them to a higher level (hoisting). - transform.buddy.hoist_static_alloc %15 : (!transform.any_op) -> () - - // Collects patterns for folding memref aliasing ops (memref.subview) into consumer load/store ops (affine.load, memref.load, nvgpu.ldmatrix, vector.load, vector.transfer_read, affine.store, memref.store, etc.) and other ops (e.g., memref.subview). - transform.apply_patterns to %15 { - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - // Collects patterns for extracting address computations from operations with memory accesses such that these memory accesses use only a base pointer. - transform.apply_patterns to %15 { - transform.apply_patterns.memref.extract_address_computations - } : !transform.any_op - // Perform canonicalization. 
- transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %15 : !transform.any_op - %all_loops_5 = transform.structured.match interface{LoopLikeInterface} - in %15 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_5 : !transform.any_op - transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Adds patterns that unroll vectors to a native tile size for GPUs with mma operations - transform.apply_patterns to %15 { - transform.apply_patterns.buddy.unroll_vectors_gpu_mma_sync - } : !transform.any_op - - // Insert a gpu.barrier after a given scf.for loop - %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for"> - // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () - - - transform.apply_patterns to %15 { - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - transform.apply_cse to %15 : !transform.any_op - - // Hoist vector.transfer_read / vector.transfer_write pairs out of immediately enclosing scf::ForOp iteratively - // Warning: Deprecated - %17 = transform.structured.hoist_redundant_vector_transfers %15 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - %all_loops_6 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_6 : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // This converts slices of operations containing vector.contract op into - // mma operations, targetting warp level tensorcore operations. - transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> () - - // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - %all_loops_7 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_7 : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op - %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc"> - - // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations. 
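-    // With factor = 3 below, three copies of the shared-memory tile are rotated
-    // across loop iterations, so a later iteration's copy-in does not have to wait
-    // for the previous iteration to finish reading its buffer; the (currently
-    // disabled) shared-memory-copy pipelining further down relies on this.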
- transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op - - transform.apply_patterns to %17 { - transform.apply_patterns.vector.transfer_to_scf full_unroll = true - } : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - %all_loops_8 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_8 : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Convert sync copies to shared memory to async. - // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> () - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - %all_loops_9 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_9 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - - - %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op - %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op - // This applies software pipelining to a given scf.for loop. - // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop. - // %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. 
- transform.apply_patterns to %17 { - transform.apply_patterns.vector.lower_masks - } : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.vector.materialize_masks - } : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - - %all_loops_10 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_10 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - - transform.yield - } -} // module diff --git a/examples/BuddyTest/.gitignore b/examples/BuddyTest/.gitignore deleted file mode 100644 index 081f173509..0000000000 --- a/examples/BuddyTest/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -__pycache__ -*.mlir -log.ll diff --git a/examples/BuddyTest/CMakeLists.txt b/examples/BuddyTest/CMakeLists.txt deleted file mode 100644 index 8039bfcc15..0000000000 --- a/examples/BuddyTest/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -add_custom_command( - OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir - COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyTest/import-test.py - COMMENT "Generating forward.mlir" -) - - -add_custom_command( - OUTPUT forward.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llvm-as | - ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyTest/forward.o - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir - COMMENT "Building forward.o" - VERBATIM) - - -add_library(TEST STATIC forward.o) - -SET_TARGET_PROPERTIES(TEST PROPERTIES LINKER_LANGUAGE C) - -add_executable(buddy-test-run test-main.cpp) -target_link_directories(buddy-test-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) - -set(BUDDY_TEST_LIBS TEST mlir_runner_utils mlir_cuda_runtime) -target_link_libraries(buddy-test-run ${BUDDY_TEST_LIBS}) diff --git a/examples/BuddyTest/README.md b/examples/BuddyTest/README.md deleted file mode 100644 index f057723bb3..0000000000 --- a/examples/BuddyTest/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Buddy Compiler Test Example - -0. Activate your python environment. - -1. Build LLVM/MLIR - -```bash -$ cd buddy-mlir -$ mkdir llvm/build -$ cd llvm/build -$ cmake -G Ninja ../llvm \ - -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \ - -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \ - -DMLIR_ENABLE_CUDA_RUNNER=ON \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ - -DCMAKE_BUILD_TYPE=RELEASE \ - -DMLIR_ENABLE_BINDINGS_PYTHON=ON \ - -DPython3_EXECUTABLE=$(which python3) -$ ninja check-clang check-mlir omp -``` - -2. Build buddy-mlir - -```bash -$ mkdir build && cd build -$ cmake -G Ninja .. 
\ - -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \ - -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DCMAKE_BUILD_TYPE=RELEASE \ - -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \ - -DPython3_EXECUTABLE=$(which python3) -$ ninja -$ ninja check-buddy -``` - -3. Set the `PYTHONPATH` environment variable. - -Make sure you are in the build directory. - -```bash -$ export BUDDY_MLIR_BUILD_DIR=$PWD -$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} -``` - -4. Build and run the Test example - -```bash -$ cmake -G Ninja .. -DBUDDY_TEST_EXAMPLES=ON -$ ninja buddy-test-run -$ cd bin -$ ./buddy-test-run -``` - -## Debug the Lowering Pass Pipeline with Fake Parameters. - -```bash -$ cd buddy-mlir -$ cd examples/BuddyTest -$ make gpu-test-lower -$ make gpu-test-translate -$ make gpu-test-run -``` diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py deleted file mode 100644 index 79620d9d44..0000000000 --- a/examples/BuddyTest/import-test.py +++ /dev/null @@ -1,55 +0,0 @@ -# ===- buddy-lenet-import.py --------------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# This is the Test model AOT importer. -# -# ===--------------------------------------------------------------------------- - -import os -from pathlib import Path - -import numpy as np -import torch -from torch._inductor.decomposition import decompositions as inductor_decomp - -from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse -from buddy.compiler.ops.gpu import ops_registry as gpu_ops_registry -from model import TestModule - -model = TestModule() -model = model.eval() - -# Initialize Dynamo Compiler with specific configurations as an importer. -dynamo_compiler = DynamoCompiler( - primary_registry=gpu_ops_registry, - aot_autograd_decomposition=inductor_decomp, -) - -data = torch.randn([1, 1, 12, 10]) -# Import the model into MLIR module and parameters. 
-with torch.no_grad(): - graphs = dynamo_compiler.importer(model, data) - -assert len(graphs) == 1 -graph = graphs[0] -print(graph.body) -graph.lower_to_top_level_ir() -path_prefix = os.path.dirname(os.path.abspath(__file__)) -with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: - print(graph._imported_module, file=module_file) - \ No newline at end of file diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile deleted file mode 100644 index 9c4c2e4a0c..0000000000 --- a/examples/BuddyTest/makefile +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -BUDDY_OPT := ../../build/bin/buddy-opt -MLIR_OPT := ../../llvm/build/bin/mlir-opt -MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate -MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner -LLC := ../../llvm/build/bin/llc -OPT_FLAG := -O0 - -ifeq ($(shell uname),Linux) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so -MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so -MLIR_CUDA_RUNTIME := ../../llvm/build/lib/libmlir_cuda_runtime.so -MTRIPLE := x86_64-unknown-linux-gnu -else ifeq ($(shell uname),Darwin) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib -MLIR_ASYNC_RUNTIME := ./../llvm/build/lib/libmlir_async_runtime.dylib -MTRIPLE := x86_64-apple-darwin -endif - -gpu-test-lower: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_OPT} -o log.mlir - -gpu-test-translate: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll - -gpu-test-run: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} - -gpu-conv2d-lower: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_OPT} -o log.mlir - -gpu-conv2d-translate: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, 
gpu-module-to-binary)" | \ - ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll - -gpu-conv2d-run: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py deleted file mode 100644 index d72af61c95..0000000000 --- a/examples/BuddyTest/model.py +++ /dev/null @@ -1,37 +0,0 @@ -# ===- model.py ---------------------------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# Test model definition. -# -# ===--------------------------------------------------------------------------- - -import torch -import torch.nn as nn - -class TestModule(nn.Module): - def __init__(self): - super(TestModule, self).__init__() - self.conv1 = nn.Conv2d(1, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.fc1 = nn.Linear(120,84) - - def forward(self, x): - # x = self.conv1(x) - # x = self.pool(x) - x = x.view(-1, 120) - x = self.fc1(x) - return x - diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp deleted file mode 100644 index d1764bccd2..0000000000 --- a/examples/BuddyTest/test-main.cpp +++ /dev/null @@ -1,115 +0,0 @@ -//===- test-main.cpp ------------------------------------------------------===// -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include - -using namespace buddy; - -// extern "C" void -// _mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); - -extern "C" void -_mlir_ciface_forward(MemRef *result, MemRef *input); - -int main() { - /// Initialize data containers. 
- const int N = 1; - const int C = 1; - const int K = 1; - const int kernel_size = 2; - const int stride = 2; - const int H = 32; - const int W = 32; - const int H_out = H / kernel_size; - const int W_out = W / kernel_size; - - MemRef input({N, C, H, W}); - // MemRef filter({K, C, kernel_size, kernel_size}); - // MemRef bias({K}); - MemRef result({N, C, H_out, W_out}); - - // Initial the input data - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - for (int i = 0; i < H; i++) { - for (int j = 0; j < W; j++) { - int index = n * C * H * W + c * H * W + i * W + j; - input[index] = static_cast((float)index/(H*W)); - } - } - } - } - // for (int k = 0; k < K; k++) { - // for (int c = 0; c < C; c++) { - // for (int i = 0; i < kernel_size; i++) { - // for (int j = 0; j < kernel_size; j++) { - // int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; - // filter[index] = static_cast(1); - // } - // } - // } - // } - - // for (int k = 0; k < K; k++) { - // bias[k] = 1; - // } - - // Print the generated data to verify - - // for (int i = 0; i < H; i++) { - // for (int j = 0; j < W; j++) { - // std::cout << input[i * W + j] << " "; - // } - // std::cout << std::endl; - // } - - const auto inferenceStart = std::chrono::high_resolution_clock::now(); - - /// Execute forward inference of the model. - _mlir_ciface_forward(&result, &input); - - const auto inferenceEnd = std::chrono::high_resolution_clock::now(); - const std::chrono::duration inferenceTime = - inferenceEnd - inferenceStart; - - /// Print the output data for verification. - std::cout << "\033[33;1m[Output] \033[0m"; - std::cout << "["; - for (int i = 0; i < H_out; i++) { - if (i > 0) std::cout << " "; - std::cout << "["; - for (int j = 0; j < W_out; j++) { - if (j > 0) std::cout << " "; - std::cout << result[i * W_out + j]; - } - std::cout << "]"; - if (i < H_out - 1) std::cout << "\n "; - } - std::cout << "]" << std::endl; - - /// Print the performance. 
- std::cout << "\033[33;1m[Time] \033[0m"; - std::cout << inferenceTime.count() << " ms" - << std::endl; - - return 0; -} diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c4449a0a81..3aa1195d10 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -16,10 +16,6 @@ if (BUDDY_LENET_EXAMPLES) add_subdirectory(BuddyLeNet) endif() -if (BUDDY_TEST_EXAMPLES) - add_subdirectory(BuddyTest) -endif() - if(BUDDY_WHISPER_EXAMPLES) add_subdirectory(BuddyWhisper) endif() diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index 4d613473a0..9d8c80f014 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -42,7 +42,6 @@ from .ops.tosa import ops_registry as tosa_ops_registry from .ops.math import ops_registry as math_ops_registry from .ops.func import ops_registry as func_ops_registry -from .ops.gpu import ops_registry as gpu_ops_registry from .graph import Graph, TensorDType, TensorMeta from .graph.operation import * from .graph.transform import maxpool2d_simplify @@ -99,14 +98,12 @@ def __init__( self._verbose = verbose self._imported_graphs = [] self._ops_registry = {} - self._ops_gpu_registry = {} self._imported_params = {} self._ops_registry.update(math_ops_registry) self._ops_registry.update(linalg_ops_registry) self._ops_registry.update(tosa_ops_registry) self._ops_registry.update(func_ops_registry) self._ops_registry.update(primary_registry) - self._ops_gpu_registry.update(gpu_ops_registry) self._ops_map = { "output": OutputOp, "placeholder": PlaceholderOp, @@ -286,7 +283,6 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): func_inputs, fake_params, self._ops_registry, - self._ops_gpu_registry, self._func_name, self._verbose ) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 6a18f8b80d..c7239a0d7d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -107,7 +107,6 @@ def __init__( inputs: List[TensorMeta], fake_params: List[TensorMeta], ops_registry: dict, - ops_gpu_registry: dict, func_name: str, device: DeviceType = DeviceType.CPU, verbose=False @@ -133,7 +132,6 @@ def __init__( self._imported_module = None self._verbose = verbose self._ops_registry = ops_registry - self._ops_gpu_registry = ops_gpu_registry self._func_name = func_name self._ctx = ir.Context() self._output_memref = None @@ -182,7 +180,7 @@ def init_op_group(self): # continue # group = [op] # subgraph_name = "subgraph{}".format(i) - # self.group_map_device[subgraph_name] = DeviceType.GPU + # self.group_map_device[subgraph_name] = DeviceType.CPU # self.op_groups[subgraph_name] = group group = [] for i, op in enumerate(self._body): @@ -258,7 +256,6 @@ def lower_to_top_level_ir(self): self._inputs, self._func_name, self._ops_registry, - self._ops_gpu_registry, False, self.device, verbose=self._verbose @@ -455,7 +452,6 @@ def __init__( inputs: List[TensorMeta], func_name: str, ops_registry: dict, - ops_gpu_registry: dict, do_param_pack: bool = False, device: DeviceType = DeviceType.CPU, verbose=False @@ -483,7 +479,6 @@ def __init__( self._num_input_visited = 0 self._module = ir.Module.create() self._ops_registry = ops_registry - self._ops_gpu_registry = ops_gpu_registry self._current_param_pack_offset = None def _str_to_mlir_dtype(self, dtype: str) -> ir.Type: @@ -577,11 +572,11 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] - if self._device == DeviceType.GPU: - returns = [ - buffer.to_tensor(ret) - for ret 
in returns - ] + # if self._device == DeviceType.GPU: + # returns = [ + # buffer.to_tensor(ret) + # for ret in returns + # ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) @@ -609,8 +604,8 @@ def generated_func(*args): return self._symbol_table.get(("output", 0)) - if self._device == DeviceType.GPU: - self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() + # if self._device == DeviceType.GPU: + # self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() return self._module @@ -712,14 +707,14 @@ def _import_placeholder( placeholder_name = args_list[self._num_input_visited] # TODO : Consider converting arg type from RankedTensorType to MemRefType - if self._device == DeviceType.GPU: - placeholder_name = buffer.to_memref( - ir.MemRefType.get( - list(node.tensor_meta.shape), - self._str_to_mlir_dtype(node.tensor_meta.dtype) - ), - placeholder_name - ) + # if self._device == DeviceType.GPU: + # placeholder_name = buffer.to_memref( + # ir.MemRefType.get( + # list(node.tensor_meta.shape), + # self._str_to_mlir_dtype(node.tensor_meta.dtype) + # ), + # placeholder_name + # ) self._symbol_table[(str(node.name), 0)] = placeholder_name self._num_input_visited += 1 @@ -734,14 +729,14 @@ def _import_op(self, node: Op): """ op_name = node.__class__.__name__ - if self._device == DeviceType.CPU: - op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - self._ops_registry[op_name](node, self._symbol_table) - ) - else: - op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - self._ops_gpu_registry[op_name](node, self._symbol_table) - ) + # if self._device == DeviceType.CPU: + op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + self._ops_registry[op_name](node, self._symbol_table) + ) + # else: + # op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + # self._ops_gpu_registry[op_name](node, self._symbol_table) + # ) if isinstance(op_ret, tuple | List): for i, operation in enumerate(op_ret): if isinstance(operation, ir.Operation) or isinstance( diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 356eb0922b..58e7766cb1 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -153,7 +153,6 @@ def build_subgraph_by_group(self): subgraph_input, [], self._graph._ops_registry, - self._graph._ops_gpu_registry, subgraph_name, subgraph_device, verbose=self._graph._verbose @@ -217,7 +216,6 @@ def construct_main_graph(self, do_param_pack=False): self._graph._inputs, self._graph._fake_params, self._graph._ops_registry, - self._graph._ops_gpu_registry, self._graph._func_name, self._graph._verbose ) @@ -298,7 +296,6 @@ def construct_main_graph(self, do_param_pack=False): main_graph._inputs, main_graph._func_name, main_graph._ops_registry, - main_graph._ops_gpu_registry, do_param_pack, ) return main_importer.import_main_graph() diff --git a/frontend/Python/graph/json_decoder.py b/frontend/Python/graph/json_decoder.py index d8bac5c77a..cfa825b0aa 100644 --- a/frontend/Python/graph/json_decoder.py +++ b/frontend/Python/graph/json_decoder.py @@ -10,7 +10,6 @@ from ..ops.tosa import ops_registry as tosa_ops_registry from ..ops.math import ops_registry as math_ops_registry from ..ops.func import ops_registry as func_ops_registry -from ..ops.gpu import ops_registry as gpu_ops_registry def json_to_graph(json_str): """ @@ -59,7 +58,6 @@ def json_to_tensormeta(json_data): inputs, 
params, ops_registry, - gpu_ops_registry, graph_name ) graph.device = _graph['device'] diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py deleted file mode 100644 index 9c8a5265e3..0000000000 --- a/frontend/Python/ops/gpu.py +++ /dev/null @@ -1,729 +0,0 @@ -# ===- gpu.py ----------------------------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# The registry of mappings from Buddy node to MLIR GPU kernel. -# -# ===--------------------------------------------------------------------------- - - -from typing import Tuple -import mlir.ir as ir -from mlir.dialects import gpu, memref, arith, scf, vector - -from ..graph import TensorDType -from ..graph import ( - ReluOp, - ReshapeOp, - PermuteOp, - Conv2dOp, - MaxPool2dOp, - AddMMOp -) -from .utils import * - -TILE_WIDTH = 16 - -def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): - """ - Import the buddy ReluOp. - From Buddy ReluOp to MLIR Relu GPU kernel. - """ - assert len(node.args) == 1 - input = symbol_table.get((str(node.args[0]), 0)) - if input is None: - return - output_shape = list(node.tensor_meta["shape"]) - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) - - # Flatten the input into a one-dimensional format - output_size = tensor_shape_size(output_shape) - size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) - shape = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - memref.StoreOp(size, shape, [c0]) - memref_reshape_type = ir.MemRefType.get([output_size], element_type) - input_reshape = memref.ReshapeOp(memref_reshape_type, input, shape) - - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input) - gpu.HostRegisterOp(input_cast) - gpu_kernel = gpu.LaunchOp( - asyncToken=None, - asyncDependencies=[], - gridSizeX=c1.result, - gridSizeY=c1.result, - gridSizeZ=c1.result, - blockSizeX=kernels.result, - blockSizeY=c1.result, - blockSizeZ=c1.result, - ) - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), # block_id x - ir.IndexType.get(), # block_id y - ir.IndexType.get(), # block_id z - ir.IndexType.get(), # thread_id x - ir.IndexType.get(), # thread_id y - ir.IndexType.get(), # thread_id z - ir.IndexType.get(), # grid_size x - ir.IndexType.get(), # grid_size y - ir.IndexType.get(), # grid_size z - ir.IndexType.get(), # block_size x - ir.IndexType.get(), # block_size y - ir.IndexType.get(), # block_size z - ] - ) - - with ir.InsertionPoint(gpu_kernel_block): - 
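-        # Block-stride loop: the kernel is launched with a single block of 512
-        # threads (see the launch configuration above), so thread t processes
-        # elements t, t + 512, t + 1024, ... of the flattened tensor until `size`.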
thread_local_idx = gpu_kernel_block.arguments[3] - element_attr = mlir_element_attr_get(dtype, 0.0) - cst_0 = arith.ConstantOp(element_type, element_attr) - loop = scf.ForOp( - lower_bound=thread_local_idx, - upper_bound=size, - step=gpu_kernel.blockSizeX - ) - with ir.InsertionPoint(loop.body): - load = memref.LoadOp(input_reshape, [loop.induction_variable]) - result = arith.MaxNumFOp(load, cst_0) - memref.StoreOp(result, input_reshape, [loop.induction_variable]) - scf.YieldOp([]) - - gpu.TerminatorOp() - - gpu.HostUnregisterOp(input_cast) - output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - memref.CopyOp(input, output) - return output - - -# TODO: Implement Reshape Operation on GPU in future revisions. -def reshape_op(node: ReshapeOp, symbol_table): - """ - Import the reshape operation. - From buddy graph ir's `ReshapeOp` operator to MLIR Memref `reshape` - operation. - - Note: If the new shape contains one and only one `-1`, the size of the new - shape will be inferred automatically. - """ - input1 = symbol_table.get((str(node.args[0]), 0)) - new_shape = [] - for i in node.args[1]: - new_shape.append(i) - output_shape = list(node.tensor_meta["shape"]) - total_size = tensor_shape_size(output_shape) - - neg_one_cnt = 0 - rest_size = 1 - for dim_siz in new_shape: - if dim_siz == -1: - neg_one_cnt += 1 - continue - rest_size *= dim_siz - - if neg_one_cnt != 0: - if neg_one_cnt > 1 or total_size % rest_size != 0: - raise ValueError("Can not infer the new shape!") - infer_dim_size = total_size // rest_size - for i, _ in enumerate(new_shape): - if new_shape[i] == -1: - new_shape[i] = infer_dim_size - - shape = memref.AllocOp(ir.MemRefType.get([len(new_shape)], ir.IndexType.get()), [], []) - for i, _ in enumerate(new_shape): - c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i)) - size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), new_shape[i])) - memref.StoreOp(size, shape, [c]) - - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - output_type = ir.MemRefType.get(new_shape, element_type) - op = memref.ReshapeOp(output_type, input1, shape) - - return op - - -# TODO: Implement Permute Operation on GPU in future revisions. -def permute_op(node: PermuteOp, symbol_table): - """ - Import the permute operation. - From buddy graph ir's `PermuteOp` operator to MLIR Memref `transpose` - operation. - """ - input1 = symbol_table.get((str(node.args[0]), 0)) - perm_map = node.args[1] - perm_map_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm_map)) - - output_shape = list(node.tensor_meta["shape"]) - dtype = node.tensor_meta["dtype"] - - element_type = mlir_element_type_get(dtype) - element_attr = mlir_element_attr_get(dtype, 0.0) - - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - f0 = arith.ConstantOp(element_type, element_attr) - - v0 = vector.transfer_read( - vector=ir.VectorType.get(output_shape, element_type), - source=input1, - indices=[c0]*len(output_shape), - permutation_map=perm_map_attr, - padding=f0 - ) - - transpose = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - - vector.transfer_write( - result=None, - vector=v0, - source=transpose, - indices=[c0]*len(output_shape), - permutation_map=ir.AffineMapAttr.get( - ir.AffineMap.get_permutation([i for i in range(len(output_shape))]) - ) - ) - return transpose - - -# TODO: Consider the cases where the arguments take different values. 
-def convolution2d_op(node: Conv2dOp, symbol_table): - """ - Import the convolution operation. - From Buddy Conv2dOp to MLIR GPU `conv2d` kernel. - arg[0]: Tensor input - arg[1]: Tensor weight - arg[2]: Tensor? bias - arg[3]: SymInt[] stride - arg[4]: SymInt[] padding - arg[5]: SymInt[] dilation - arg[6]: bool transposed - arg[7]: SymInt[] output_padding - arg[8]: SymInt groups - """ - # Get arguments from convolution node. - assert len(node.args) == 9 - input = node.args[0] - filter = node.args[1] - bias = node.args[2] - stride = node.args[3] - input_padding = node.args[4] - dilation = node.args[5] - is_kernel_transposed = node.args[6] - out_padding = node.args[7] - groups = node.args[8] - - # TODO: Consider the cases where the variables take different values. - assert input_padding[0] == input_padding[1] == 0 - assert dilation[0] == dilation[1] == 1 - assert is_kernel_transposed == False - assert out_padding[0] == out_padding[1] == 0 - assert groups == 1 - - # Prepare input, filter, and output information. - input_val = symbol_table.get((str(input), 0)) - input_shape = list(ir.MemRefType(input_val.type).shape) - filter_val = symbol_table.get((str(filter), 0)) - filter_shape = ir.MemRefType(filter_val.type).shape - bias_val = symbol_table.get((str(bias), 0)) - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - output_shape = list(node.tensor_meta["shape"]) - - batch_size = input_shape[0] - in_channels = input_shape[1] - out_channels = output_shape[1] - in_size_h = input_shape[2] - in_size_w = input_shape[3] - out_size_h = output_shape[2] - out_size_w = output_shape[3] - H_filter = filter_shape[2] - W_filter = filter_shape[3] - - output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input_val) - filter_cast = memref.CastOp(unranked_memref_type, filter_val) - bias_cast = memref.CastOp(unranked_memref_type, bias_val) - output_cast = memref.CastOp(unranked_memref_type, output_val) - - gpu.HostRegisterOp(input_cast) - gpu.HostRegisterOp(filter_cast) - gpu.HostRegisterOp(bias_cast) - gpu.HostRegisterOp(output_cast) - - # Tile the input_val into Grids - block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) - batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) - in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) - out_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_channels)) - block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) - tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) - H_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_filter)) - W_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_filter)) - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - - # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) - - gpu_kernel = gpu.LaunchOp( - asyncToken=None, - asyncDependencies=[], - gridSizeX=batch_size_val.result, - gridSizeY=out_channels_val.result, - 
gridSizeZ=block_z_val.result, - blockSizeX=tile_width_val.result, - blockSizeY=tile_width_val.result, - blockSizeZ=c1.result, - ) - - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), # block_id x - ir.IndexType.get(), # block_id y - ir.IndexType.get(), # block_id z - ir.IndexType.get(), # thread_id x - ir.IndexType.get(), # thread_id y - ir.IndexType.get(), # thread_id z - ir.IndexType.get(), # grid_size x - ir.IndexType.get(), # grid_size y - ir.IndexType.get(), # grid_size z - ir.IndexType.get(), # block_size x - ir.IndexType.get(), # block_size y - ir.IndexType.get(), # block_size z - ] - ) - - with ir.InsertionPoint(gpu_kernel_block): - batch_id = gpu_kernel_block.arguments[0] - out_channel_id = gpu_kernel_block.arguments[1] - tile_id = gpu_kernel_block.arguments[2] - thread_local_idx = gpu_kernel_block.arguments[3] - thread_local_idy = gpu_kernel_block.arguments[4] - - # Calculate the convolution element at (h, w) for this thread - tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH - tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) - - t0 = arith.divui(tile_id, tile_num_val) - t1 = arith.muli(t0, tile_width_val) - thread_global_idx = arith.addi(t1, thread_local_idx) - - t2 = arith.remui(tile_id, tile_num_val) - t3 = arith.muli(t2, tile_width_val) - thread_global_idy = arith.addi(t3, thread_local_idy) - - stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) - stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) - t4 = arith.muli(thread_global_idx, stride_h) - t5 = arith.muli(thread_global_idy, stride_w) - - # Check if the (h, w) is out of the output bounds - ult = 6 - out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) - out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) - isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) - isInBounds = arith.andi(isHInBounds, isWInBounds) - - cst_0 = arith.ConstantOp(element_type, mlir_element_attr_get(dtype, 0.0)) - branch0 = scf.IfOp(isInBounds) - with ir.InsertionPoint(branch0.then_block): - loop0 = scf.ForOp( - lower_bound=c0.result, - upper_bound=in_channels_val.result, - step=c1.result, - iter_args=[cst_0.result] - ) - with ir.InsertionPoint(loop0.body): - loop1 = scf.ForOp( - lower_bound=c0.result, - upper_bound=H_filter_val.result, - step=c1.result, - iter_args=[cst_0.result] - ) - with ir.InsertionPoint(loop1.body): - loop2 = scf.ForOp( - lower_bound=c0.result, - upper_bound=W_filter_val.result, - step=c1.result, - iter_args=[cst_0.result] - ) - with ir.InsertionPoint(loop2.body): - # TODO : loop body - in_channel_id = loop0.body.arguments[0] - filter_ele_idx = loop1.body.arguments[0] - filter_ele_idy = loop2.body.arguments[0] - input_ele_idx = arith.addi(t4, filter_ele_idx) - input_ele_idy = arith.addi(t5, filter_ele_idy) - input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) - filter_ele = memref.LoadOp(filter_val, [out_channel_id, in_channel_id, filter_ele_idx, filter_ele_idy]) - t6 = arith.mulf(input_ele, filter_ele) - iter_arg2 = loop2.body.arguments[1] - iter_res2 = arith.addf(iter_arg2, t6) - scf.YieldOp([iter_res2]) - - iter_arg1 = loop1.body.arguments[1] - iter_res1 = arith.addf(loop2, iter_arg1) - 
scf.YieldOp([iter_res1]) - - iter_arg0 = loop0.body.arguments[1] - iter_res0 = arith.addf(loop1, iter_arg0) - scf.YieldOp([iter_res0]) - - # Add bias data for any out_channel. - bias_ele = memref.LoadOp(bias_val, [out_channel_id]) - result = arith.addf(loop0, bias_ele) - memref.StoreOp(result, output_val, [batch_id, out_channel_id, thread_global_idx, thread_global_idy]) - scf.YieldOp([]) - - gpu.TerminatorOp() - - gpu.HostUnregisterOp(input_cast) - gpu.HostUnregisterOp(filter_cast) - gpu.HostUnregisterOp(bias_cast) - gpu.HostUnregisterOp(output_cast) - - return output_val - - -# TODO: Consider the cases where the maxpool2d operation needs padding. -def maxpool2d_op(node: MaxPool2dOp, symbol_table): - """ - Import the maxpool2d operation. - From Buddy MaxPool2dOp to MLIR GPU `max_pool2d` kernel. - """ - if len(node.args) == 5: - raise NotImplementedError - input1 = node.args[0] - kernel = node.args[1] - stride = node.args[2] - - # Prepare padding data - if len(node.args) > 3: - pad = node.args[3] - else: - pad = [0 for _ in kernel] - - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - output_shape = node.tensor_meta["shape"] - - batch_size = output_shape[0] - in_channels = output_shape[1] - out_size_h = output_shape[2] - out_size_w = output_shape[3] - - input_val = symbol_table.get((str(input1), 0)) - output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input_val) - output_cast = memref.CastOp(unranked_memref_type, output_val) - - gpu.HostRegisterOp(input_cast) - gpu.HostRegisterOp(output_cast) - - # Tile the input_val into Grids - block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) - batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) - in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) - block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) - tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - - # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) - - gpu_kernel = gpu.LaunchOp( - asyncToken=None, - asyncDependencies=[], - gridSizeX=batch_size_val.result, - gridSizeY=in_channels_val.result, - gridSizeZ=block_z_val.result, - blockSizeX=tile_width_val.result, - blockSizeY=tile_width_val.result, - blockSizeZ=c1.result, - ) - - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), # block_id x - ir.IndexType.get(), # block_id y - ir.IndexType.get(), # block_id z - ir.IndexType.get(), # thread_id x - ir.IndexType.get(), # thread_id y - ir.IndexType.get(), # thread_id z - ir.IndexType.get(), # grid_size x - ir.IndexType.get(), # grid_size y - ir.IndexType.get(), # grid_size z - ir.IndexType.get(), # block_size x - ir.IndexType.get(), # block_size y - ir.IndexType.get(), # block_size z - ] - ) - - with ir.InsertionPoint(gpu_kernel_block): - batch_id = gpu_kernel_block.arguments[0] - in_channel_id = gpu_kernel_block.arguments[1] - tile_id = gpu_kernel_block.arguments[2] - thread_local_idx = 
gpu_kernel_block.arguments[3] - thread_local_idy = gpu_kernel_block.arguments[4] - - # Calculate the convolution element at (h, w) for this thread - tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH - tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) - - t0 = arith.divui(tile_id, tile_num_val) - t1 = arith.muli(t0, tile_width_val) - thread_global_idx = arith.addi(t1, thread_local_idx) - - t2 = arith.remui(tile_id, tile_num_val) - t3 = arith.muli(t2, tile_width_val) - thread_global_idy = arith.addi(t3, thread_local_idy) - - kernel_size_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) - kernel_size_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) - stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) - stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) - init_ele_idx = arith.muli(thread_global_idx, stride_h) - init_ele_idy = arith.muli(thread_global_idy, stride_w) - - # Check if the (h, w) is out of the output bounds - ult = 6 - out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) - out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) - isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) - isInBounds = arith.andi(isHInBounds, isWInBounds) - - branch0 = scf.IfOp(isInBounds) - with ir.InsertionPoint(branch0.then_block): - first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, init_ele_idx, init_ele_idy]) - loop0 = scf.ForOp( - lower_bound=c0.result, - upper_bound=kernel_size_h.result, - step=c1.result, - iter_args=[first_ele.result] - ) - with ir.InsertionPoint(loop0.body): - loop1 = scf.ForOp( - lower_bound=c0.result, - upper_bound=kernel_size_w.result, - step=c1.result, - iter_args=[first_ele.result] - ) - with ir.InsertionPoint(loop1.body): - # TODO : loop body - kernel_ele_idx = loop0.body.arguments[0] - kernel_ele_idy = loop1.body.arguments[0] - input_ele_idx = arith.addi(init_ele_idx, kernel_ele_idx) - input_ele_idy = arith.addi(init_ele_idy, kernel_ele_idy) - input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) - iter_arg1 = loop1.body.arguments[1] - iter_res1 = arith.maxnumf(iter_arg1, input_ele) - scf.YieldOp([iter_res1]) - - iter_arg0 = loop0.body.arguments[1] - iter_res0 = arith.maxnumf(loop1, iter_arg0) - scf.YieldOp([iter_res0]) - - memref.StoreOp(loop0, output_val, [batch_id, in_channel_id, thread_global_idx, thread_global_idy]) - scf.YieldOp([]) - - gpu.TerminatorOp() - - gpu.HostUnregisterOp(input_cast) - gpu.HostUnregisterOp(output_cast) - - return output_val - - -def addmm_op( - node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation] -): - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) - - # TODO: Reverse the order of the mat2 before multiplication to optimize the cache hit rate - - input_data = symbol_table.get((str(node.args[1]), 0), node.args[1]) - weight = symbol_table.get((str(node.args[2]), 0), 
node.args[2]) - bias = symbol_table.get((str(node.args[0]), 0), node.args[0]) - # print("input_data: "+str(input_data)) - # print("weight: "+str(weight)) - # print("bias: "+str(bias)) - - # TODO: Transpose of the mat2 before multiplication to optimize the cache hit rate - - output_shape = list(node.tensor_meta["shape"]) - input_shape = input_data.type.shape - weight_shape = weight.type.shape - # print("output_shape: "+str(output_shape)) - # print("output_shape: "+str()) - # print("input_shape: "+str(input_shape)) - # print("weight_shape: "+str(weight_shape)) - # print("bias shape: "+str(bias.type.shape)) - - # Flatten the input into a one-dimensional format - input_size = tensor_shape_size(input_shape) - weight_size = tensor_shape_size(weight_shape) - output_size = tensor_shape_size(output_shape) - # print("input_size: "+str(input_size)) - # print("weight_size: "+str(weight_size)) - # print("output_size: "+str(output_size)) - - input_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_size)) - weight_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), weight_size)) - output_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) - # print("input_size_c: "+str(input_size_c)) - # print("weight_size_c: "+str(weight_size_c)) - # print("output_size_c: "+str(output_size_c)) - - input_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - weight_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - bias_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - # print("input_shape_1d: "+str(input_shape_1d)) - # print("weight_shape_1d: "+str(weight_shape_1d)) - # print("bias_shape_1d: "+str(bias_shape_1d)) - - memref.StoreOp(input_size_c, input_shape_1d, [c0]) - memref.StoreOp(weight_size_c, weight_shape_1d, [c0]) - memref.StoreOp(output_size_c, bias_shape_1d, [c0]) - - input_reshape_type = ir.MemRefType.get([input_size], element_type) - weight_reshape_type = ir.MemRefType.get([weight_size], element_type) - bias_reshape_type = ir.MemRefType.get([output_size], element_type) - output_type = ir.MemRefType.get(output_shape, element_type) - # print("input_reshape_type: "+str(input_reshape_type)) - # print("weight_reshape_type: "+str(weight_reshape_type)) - # print("bias_reshape_type: "+str(bias_reshape_type)) - # print("output_type: "+str(output_type)) - - input_reshape_1d = memref.ReshapeOp(input_reshape_type, input_data, input_shape_1d) - weight_reshape_1d = memref.ReshapeOp(weight_reshape_type, weight, weight_shape_1d) - bias_reshape_1d = memref.ReshapeOp(bias_reshape_type, bias, bias_shape_1d) - # print("input_reshape: "+str(input_reshape_1d)) - # print("weight_reshape: "+str(weight_reshape_1d)) - # print("bias_reshape: "+str(bias_reshape_1d)) - - - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, input_reshape_1d)) - gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, weight_reshape_1d)) - gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, bias_reshape_1d)) - - row = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[0])) - col = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), weight_shape[1])) - inner_dim = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[1])) - - gpu_kernel = gpu.LaunchOp( 
- asyncToken=None, - asyncDependencies=[], - gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, - blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, - ) - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z - ] - ) - - # TODO: optimize to one dimension - with ir.InsertionPoint(gpu_kernel_block): - tIdX = gpu_kernel_block.arguments[3] - tIdY = gpu_kernel_block.arguments[4] - otter_loop = scf.ForOp( - lower_bound=tIdX, - upper_bound=row, - step=gpu_kernel.blockSizeX - ) - with ir.InsertionPoint(otter_loop.body): - inner_loop = scf.ForOp( - lower_bound=tIdY, - upper_bound=col, - step=gpu_kernel.blockSizeY - ) - with ir.InsertionPoint(inner_loop.body): - initial_sum = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) - - mul_loop = scf.ForOp( - lower_bound=c0.result, - upper_bound=inner_dim, - step=c1.result, - iter_args=[initial_sum] - ) - with ir.InsertionPoint(mul_loop.body): - sum = mul_loop.inner_iter_args[0] - mat1_load = memref.LoadOp(input_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, inner_dim).result, mul_loop.induction_variable)]) - mat2_load = memref.LoadOp(weight_reshape_1d, [arith.AddIOp(arith.MulIOp(mul_loop.induction_variable, col).result, inner_loop.induction_variable)]) - res = arith.MulFOp(mat1_load, mat2_load) - res = arith.AddFOp(sum, res) - scf.YieldOp([res]) - - sum = mul_loop.result - bias_load = memref.LoadOp(bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) - res = arith.AddFOp(sum, bias_load) - memref.StoreOp(res, bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) - scf.YieldOp([]) - scf.YieldOp([]) - gpu.TerminatorOp() - - - output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - - # FIXME: Dialect `memref' not found for custom op 'memref.expand_shape' - # axis = ir.ArrayAttr.get( - # [ - # ir.IntegerAttr.get(ir.IntegerType.get_signless(64), i) - # for i in range(len(output_shape)) - # ], - # None, - # ) - # axis = ir.ArrayAttr.get([axis], None) - # bias_reshape = memref.ExpandShapeOp(output_type, bias, axis) - - bias_shape = memref.AllocOp(ir.MemRefType.get([len(output_shape)], ir.IndexType.get()), [], []) - # print("bias_shape: "+str(bias_shape)) - for i in range(len(output_shape)): - memref.StoreOp(arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_shape[i])), bias_shape, [arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i))]) - - bias_reshape = memref.ReshapeOp(output_type, bias, bias_shape) - memref.CopyOp(bias_reshape, output) - return output - - -ops_registry = { - "ReluOp": relu_op, - "ViewOp": reshape_op, - "PermuteOp": permute_op, - "Conv2dOp": convolution2d_op, - "MaxPool2dOp": maxpool2d_op, - "AddMMOp": addmm_op -} diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp index dd50feccf8..f616127930 100644 --- a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp +++ 
b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp @@ -18,11 +18,9 @@ // //===---------------------------------------------------------------------===// -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/TypeRange.h" @@ -30,9 +28,7 @@ #include "mlir/IR/Visitors.h" #include "mlir/Support/LLVM.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include #include #include @@ -42,11 +38,8 @@ #include #include -#include -#include -#include -#include -#include +#include + using namespace mlir; using namespace vector; @@ -82,6 +75,9 @@ class ConvertMemcpyToGPUPass void ConvertMemcpyToGPUPass::runOnOperation() { auto funcOp = getOperation(); + if (funcOp.isDeclaration() || funcOp.isExternal()) + return; + // Make sure the gpu function is already outlined. funcOp->walk([&](Operation *nestedOp) { if (auto gpuLaunchOp = dyn_cast(nestedOp)) { @@ -90,8 +86,9 @@ void ConvertMemcpyToGPUPass::runOnOperation() { return WalkResult::advance(); }); - std::set unDeallocatedOperations; + std::vector unDeallocatedValue; OpBuilder builder(funcOp->getContext()); + // Copy all function arguments to gpu, needs deallocation if (processArgs) { builder.setInsertionPointToStart(&(funcOp.getBody().front())); @@ -103,23 +100,11 @@ void ConvertMemcpyToGPUPass::runOnOperation() { auto memrefType = dyn_cast(arg.getType()); auto gpuAllocOp = builder.create( builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedOperations.insert(&gpuAllocOp); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); auto gpuMemcpyOp = builder.create( gpuAllocOp.getLoc(), TypeRange(), ValueRange(), gpuAllocOp.getResult(0), arg); - // Replace all users with GPU memory - auto users = arg.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - // Don't replace memcpy's operand - if (isa(user)) - continue; - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == arg) { - user->setOperand(j, gpuAllocOp.getResult(0)); - } - } - } + arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp); } } @@ -149,19 +134,18 @@ void ConvertMemcpyToGPUPass::runOnOperation() { auto gpuAllocOp = builder.create( allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - auto users = result.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - for (size_t j = 0; j < user->getNumOperands(); j++) { - // Only the return value will not have dealloc op - if (auto deallocOp = dyn_cast(user)) { - builder.setInsertionPointAfter(deallocOp); - auto gpuDeallocOp = builder.create( - deallocOp->getLoc(), TypeRange(), ValueRange(), - gpuAllocOp.getResult(0)); - deallocOp->erase(); - } else if (user->getOperand(j) == result) { - user->setOperand(j, gpuAllocOp.getResult(0)); + + for (auto user : llvm::make_early_inc_range(result.getUsers())) { + if (auto deallocOp = dyn_cast(user)) { + builder.setInsertionPointAfter(deallocOp); + builder.create(deallocOp->getLoc(), TypeRange(), + ValueRange(), gpuAllocOp.getResult(0)); + deallocOp->erase(); + } else { + for (auto &opOperand : user->getOpOperands()) { + if (opOperand.is(result)) { + opOperand.set(gpuAllocOp.getResult(0)); + } } } } @@ -175,28 +159,8 @@ void 
ConvertMemcpyToGPUPass::runOnOperation() { builder.setInsertionPointAfter(copyOp); auto gpuMemcpyOp = builder.create( copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); - { - auto users = src.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == src) { - user->setOperand(j, gpuMemcpyOp.getOperand(1)); - } - } - } - } - { - auto users = dst.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == src) { - user->setOperand(j, gpuMemcpyOp.getOperand(0)); - } - } - } - } + src.replaceAllUsesWith(gpuMemcpyOp->getResult(1)); + dst.replaceAllUsesWith(gpuMemcpyOp->getResult(0)); copyOp->erase(); } // Allocate space on GPU and copy global memrefs to GPU, needs deallocation @@ -206,47 +170,34 @@ void ConvertMemcpyToGPUPass::runOnOperation() { auto memrefType = dyn_cast(result.getType()); auto gpuAllocOp = builder.create( getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedOperations.insert(&gpuAllocOp); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); + auto src = result; auto dst = gpuAllocOp->getResult(0); auto gpuMemcpyOp = builder.create( gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); - { - auto users = src.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - if (isa(user)) - continue; - // TODO: replace with src.replaceAllUsesExcept() - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == src) { - user->setOperand(j, dst); - } - } - } - } + src.replaceAllUsesExcept(dst, gpuMemcpyOp); } // Copy data back to CPU, deallocate GPU, then return else if (auto returnOp = dyn_cast(nestedOp)) { builder.setInsertionPoint(returnOp); - - for (auto *gpuAllocOp : unDeallocatedOperations) { - auto gpuDeallocOp = builder.create( - builder.getUnknownLoc(), TypeRange(), ValueRange(), - gpuAllocOp->getResult(0)); - } - builder.setInsertionPoint(returnOp); for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { auto val = returnOp->getOperand(i); - auto memRefType = dyn_cast(val.getType()); - auto allocOp = builder.create(builder.getUnknownLoc(), - memRefType); - auto gpuMemcpyOp = builder.create( - allocOp.getLoc(), TypeRange(), ValueRange(), allocOp->getResult(0), - val); - auto gpuDeallocOp = builder.create( - gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); - returnOp->setOperand(i, allocOp->getResult(0)); + if (auto memrefType = dyn_cast(val.getType())) { + auto allocOp = + builder.create(returnOp->getLoc(), memrefType); + builder.create(allocOp.getLoc(), TypeRange(), + ValueRange(), allocOp->getResult(0), + val); + // FIXME: may be leak memory + // auto gpuDeallocOp = builder.create( + // gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); + returnOp->setOperand(i, allocOp->getResult(0)); + } + } + for (auto value : unDeallocatedValue) { + builder.create(returnOp->getLoc(), TypeRange(), + ValueRange(), value); } } return WalkResult::advance(); @@ -260,4 +211,4 @@ void registerConvertMemcpyToGPUPass() { PassRegistration(); } } // namespace buddy -} // namespace mlir +} // namespace mlir \ No newline at end of file diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir index 63edfd8d02..f616127930 100644 --- a/tests/Conversion/convert-memcpy-to-gpu.mlir +++ 
b/tests/Conversion/convert-memcpy-to-gpu.mlir @@ -1,23 +1,214 @@ -// RUN: buddy-opt -convert-memcpy-to-gpu -canonicalize %s | FileCheck %s - -// CHECK: %memref = gpu.alloc () : memref<32x32xf32> -// CHECK: %memref_0 = gpu.alloc () : memref<32x32xf32> -// CHECK: gpu.dealloc %memref : memref<32x32xf32> -// CHECK: %alloc = memref.alloc() : memref<32x32xf32> -// CHECK: gpu.memcpy %alloc, %memref_0 : memref<32x32xf32>, memref<32x32xf32> -// CHECK: gpu.dealloc %memref_0 : memref<32x32xf32> -module attributes {gpu.container_module} { - func.func @matmul(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>) -> memref<32x32xf32> { - %c2 = arith.constant 2 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() {alignment = 64 : i64} : memref<32x32xf32> - gpu.launch_func @matmul_kernel::@matmul_kernel blocks in (%c1, %c1, %c1) threads in (%c64, %c2, %c1) - return %alloc : memref<32x32xf32> +//===- ConvertMemcpyToGPU.cpp ---------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass that converts memcpy to gpu operations. +// +//===---------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace mlir; +using namespace vector; + +//===----------------------------------------------------------------------===// +// ConvertMemcpyToGPUPass +//===----------------------------------------------------------------------===// + +namespace { + +class ConvertMemcpyToGPUPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertMemcpyToGPUPass) + StringRef getArgument() const final { return "convert-memcpy-to-gpu"; } + StringRef getDescription() const final { + return "Convert memref opertaions to gpu operations."; + } + ConvertMemcpyToGPUPass() = default; + ConvertMemcpyToGPUPass(const ConvertMemcpyToGPUPass &) {} + + Option processArgs{ + *this, "process-args", + llvm::cl::desc("Whether the pass processes the input args."), + llvm::cl::init(true)}; + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); } - gpu.module @matmul_kernel { - gpu.func @matmul_kernel() kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { - gpu.return +}; + +void ConvertMemcpyToGPUPass::runOnOperation() { + auto funcOp = getOperation(); + + if (funcOp.isDeclaration() || 
funcOp.isExternal()) + return; + + // Make sure the gpu function is already outlined. + funcOp->walk([&](Operation *nestedOp) { + if (auto gpuLaunchOp = dyn_cast(nestedOp)) { + nestedOp->emitOpError("The gpu function should be outlined."); + } + return WalkResult::advance(); + }); + + std::vector unDeallocatedValue; + OpBuilder builder(funcOp->getContext()); + + // Copy all function arguments to gpu, needs deallocation + if (processArgs) { + builder.setInsertionPointToStart(&(funcOp.getBody().front())); + unsigned numArgs = funcOp.getNumArguments(); + for (unsigned i = 0; i < numArgs; ++i) { + BlockArgument arg = funcOp.getArgument(i); + // Create a gpu.alloc op, then copy memory to it + // TODO: Move this out of operation, make the copy process async + auto memrefType = dyn_cast(arg.getType()); + auto gpuAllocOp = builder.create( + builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); + auto gpuMemcpyOp = builder.create( + gpuAllocOp.getLoc(), TypeRange(), ValueRange(), + gpuAllocOp.getResult(0), arg); + arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp); } } + + funcOp->walk([&](Operation *nestedOp) { + // Replace all allocations with GPU.alloc + if (auto allocOp = dyn_cast(nestedOp)) { + // Rewrite this allocOp to gpu.alloc, change for all users + builder.setInsertionPointAfter(allocOp); + auto result = allocOp->getResult(0); + auto memrefType = dyn_cast(result.getType()); + auto memorySpace = memrefType.getMemorySpace(); + + // Filter operations. + if (memorySpace) { + if (auto intMemorySpace = llvm::dyn_cast(memorySpace)) { + if (intMemorySpace.getInt() != 0) { + return WalkResult::advance(); + } + } else if (auto gpuMemorySpace = + llvm::dyn_cast(memorySpace)) { + if (gpuMemorySpace.getValue() != gpu::AddressSpace::Global) { + return WalkResult::advance(); + } + } else + return WalkResult::advance(); + } + + auto gpuAllocOp = builder.create( + allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); + + for (auto user : llvm::make_early_inc_range(result.getUsers())) { + if (auto deallocOp = dyn_cast(user)) { + builder.setInsertionPointAfter(deallocOp); + builder.create(deallocOp->getLoc(), TypeRange(), + ValueRange(), gpuAllocOp.getResult(0)); + deallocOp->erase(); + } else { + for (auto &opOperand : user->getOpOperands()) { + if (opOperand.is(result)) { + opOperand.set(gpuAllocOp.getResult(0)); + } + } + } + } + allocOp->erase(); + } + // Replace all memory.copy operations with gpu.memcpy + else if (auto copyOp = dyn_cast(nestedOp)) { + auto src = copyOp.getOperand(0); + auto dst = copyOp.getOperand(1); + // Notice: GPU.memcpy has a different src dst order + builder.setInsertionPointAfter(copyOp); + auto gpuMemcpyOp = builder.create( + copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); + src.replaceAllUsesWith(gpuMemcpyOp->getResult(1)); + dst.replaceAllUsesWith(gpuMemcpyOp->getResult(0)); + copyOp->erase(); + } + // Allocate space on GPU and copy global memrefs to GPU, needs deallocation + else if (auto getGlobalOp = dyn_cast(nestedOp)) { + builder.setInsertionPointAfter(getGlobalOp); + auto result = getGlobalOp->getResult(0); + auto memrefType = dyn_cast(result.getType()); + auto gpuAllocOp = builder.create( + getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); + + auto src = result; + auto dst = gpuAllocOp->getResult(0); + auto gpuMemcpyOp = builder.create( + gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); 
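+      // replaceAllUsesExcept rewires every use of the global memref to the
+      // GPU buffer while leaving the gpu.memcpy itself reading from the
+      // original host-side value, so the copy still has a valid source.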
+ src.replaceAllUsesExcept(dst, gpuMemcpyOp); + } + // Copy data back to CPU, deallocate GPU, then return + else if (auto returnOp = dyn_cast(nestedOp)) { + builder.setInsertionPoint(returnOp); + for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { + auto val = returnOp->getOperand(i); + if (auto memrefType = dyn_cast(val.getType())) { + auto allocOp = + builder.create(returnOp->getLoc(), memrefType); + builder.create(allocOp.getLoc(), TypeRange(), + ValueRange(), allocOp->getResult(0), + val); + // FIXME: may be leak memory + // auto gpuDeallocOp = builder.create( + // gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); + returnOp->setOperand(i, allocOp->getResult(0)); + } + } + for (auto value : unDeallocatedValue) { + builder.create(returnOp->getLoc(), TypeRange(), + ValueRange(), value); + } + } + return WalkResult::advance(); + }); +} +} // end anonymous namespace. + +namespace mlir { +namespace buddy { +void registerConvertMemcpyToGPUPass() { + PassRegistration(); } +} // namespace buddy +} // namespace mlir \ No newline at end of file From 53d69d61797c402b52474765a90f62d98bd04bd1 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 29 Oct 2024 06:42:10 +0000 Subject: [PATCH 24/29] CPU, GPU, Custom --- examples/BuddyLeNet/CMakeLists.txt | 78 ++++++---- examples/BuddyLeNet/buddy-lenet-import.py | 10 +- frontend/Python/graph/graph.py | 142 +++++++----------- frontend/Python/graph/graph_driver.py | 49 +++--- frontend/Python/graph/operation.py | 2 +- frontend/Python/graph/transform/__init__.py | 2 +- frontend/Python/graph/transform/fuse_ops.py | 52 ++++++- .../graph/transform/useless_op_eliminate.py | 2 +- frontend/Python/ops/tosa.py | 1 - frontend/Python/ops/utils.py | 13 -- 10 files changed, 191 insertions(+), 160 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 6e9cfe1204..5935ad50c5 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -49,8 +49,54 @@ add_custom_command( DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir COMMENT "Building subgraph0.o" VERBATIM) - -# new + +set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") +set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") +# add_custom_command( +# OUTPUT subgraph0.o +# COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt +# -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} +# -buffer-deallocation +# -convert-linalg-to-parallel-loops +# -canonicalize +# -gpu-map-parallel-loops +# -convert-parallel-loops-to-gpu +# -gpu-kernel-outlining +# -canonicalize +# -cse | +# ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | +# ${LLVM_TOOLS_BINARY_DIR}/llvm-as | +# ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# COMMENT "Building subgraph0.o" +# VERBATIM) + +add_custom_command( + OUTPUT subgraph1.o + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt 
${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt + -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} + -buffer-deallocation + -convert-linalg-to-parallel-loops + -canonicalize + -gpu-map-parallel-loops + -convert-parallel-loops-to-gpu + -gpu-kernel-outlining + -canonicalize + -cse | + ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_TOOLS_BINARY_DIR}/llvm-as | + ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMENT "Building subgraph1.o" + VERBATIM) set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") add_custom_command( @@ -75,34 +121,6 @@ add_custom_command( DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir COMMENT "Building subgraph1.o" VERBATIM) - -# add_library(LENET_GPU STATIC subgraph0_gpu.o forward.o) - -# SET_TARGET_PROPERTIES(LENET_GPU PROPERTIES LINKER_LANGUAGE C) - -# add_executable(buddy-lenet-run-gpu buddy-lenet-main.cpp) -# target_link_directories(buddy-lenet-run-gpu PRIVATE ${LLVM_LIBRARY_DIR}) - -# set(BUDDY_LENET_LIBS_GPU LENET_GPU mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime ${PNG_LIBRARIES}) - -# target_link_libraries(buddy-lenet-run-gpu ${BUDDY_LENET_LIBS_GPU}) - -# add_custom_command( -# OUTPUT subgraph1.ll -# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -# COMMENT "Building subgraph1.ll" -# VERBATIM) - -# add_custom_command( -# OUTPUT subgraph1.o -# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o -# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# COMMENT "Building subgraph1.o" -# VERBATIM) add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index c878b3b163..4acd548038 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -27,7 +27,7 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse +from buddy.compiler.graph.transform import cpu_fuse, gpu_fuse, 
custom_partition from buddy.compiler.graph.type import DeviceType from buddy.compiler.ops import tosa, gpu from buddy.compiler.graph.json_decoder import json_to_graph @@ -58,7 +58,7 @@ assert len(graphs) == 1 graph = graphs[0] params = dynamo_compiler.imported_params[graph] -pattern_list = [simply_fuse] +pattern_list = [custom_partition] graph.fuse_ops(pattern_list) path_prefix = os.path.dirname(os.path.abspath(__file__)) @@ -71,10 +71,10 @@ graph0 = json_to_graph(json_str) driver = GraphDriver(graph0) driver.subgraphs[0].lower_to_top_level_ir() -driver.subgraphs[1].lower_to_top_level_ir() - with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) +# Add heterogeneous hardware partition +driver.subgraphs[1].lower_to_top_level_ir() with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: print(driver.subgraphs[1]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: @@ -103,4 +103,4 @@ # # Convert the lenet graph to DOT string # dot_str = graph.to_dot() # with open(os.path.join(path_prefix, "graph.dot"), "w") as module_file: -# module_file.write(dot_str) \ No newline at end of file +# module_file.write(dot_str) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index c7239a0d7d..5ddbbe8328 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -109,7 +109,7 @@ def __init__( ops_registry: dict, func_name: str, device: DeviceType = DeviceType.CPU, - verbose=False + verbose=False, ) -> None: """ Initializes the Graph. @@ -175,26 +175,14 @@ def init_op_group(self): Returns: - None """ - # for i, op in enumerate(self._body): - # if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp): - # continue - # group = [op] - # subgraph_name = "subgraph{}".format(i) - # self.group_map_device[subgraph_name] = DeviceType.CPU - # self.op_groups[subgraph_name] = group group = [] for i, op in enumerate(self._body): - if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i == 25: + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp): continue - group.append(op) - subgraph_name = "subgraph1" - self.group_map_device[subgraph_name] = DeviceType.CPU - self.op_groups[subgraph_name] = group - - new_group = [self._body[25]] - subgraph_name = "subgraph0" - self.group_map_device[subgraph_name] = DeviceType.GPU - self.op_groups[subgraph_name] = new_group + group = [op] + subgraph_name = "subgraph{}".format(i) + self.group_map_device[subgraph_name] = DeviceType.CPU + self.op_groups[subgraph_name] = group def fuse_ops(self, pattern_list: List[FunctionType]): """ @@ -214,9 +202,9 @@ def fuse_ops(self, pattern_list: List[FunctionType]): # Initialize operation groups self.init_op_group() - # # Apply fusion patterns - # for pattern_func in pattern_list: - # pattern_func(self) + # Apply fusion patterns + for pattern_func in pattern_list: + pattern_func(self) def perform(self, func_list: List[FunctionType]): """ @@ -258,7 +246,7 @@ def lower_to_top_level_ir(self): self._ops_registry, False, self.device, - verbose=self._verbose + verbose=self._verbose, ) self._imported_module = fx_importer.import_graph() outputs = fx_importer.get_output_nodes() @@ -353,7 +341,7 @@ def to_dot(self): Returns: str: A DOT string representing the buddy graph for visualization. 
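
        Placeholder and output ops are drawn as white ellipses, max-pooling
        ops as red boxes, and all remaining ops as deepskyblue boxes.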
""" - dot = graphviz.Digraph(comment='Buddy Graph') + dot = graphviz.Digraph(comment="Buddy Graph") for op in self._body: # if isinstance(op, PlaceholderOp): # continue @@ -361,14 +349,23 @@ def to_dot(self): dot.edge(op._name, child) for op in self._body: if isinstance(op, PlaceholderOp): - dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + dot.node( + op._name, shape="ellipse", fillcolor="white", style="filled" + ) # continue elif isinstance(op, OutputOp): - dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + dot.node( + op._name, shape="ellipse", fillcolor="white", style="filled" + ) elif isinstance(op, MaxPool2dOp): dot.node(op._name, shape="box", fillcolor="red", style="filled") else: - dot.node(op._name, shape="box", fillcolor="deepskyblue", style="filled") + dot.node( + op._name, + shape="box", + fillcolor="deepskyblue", + style="filled", + ) return str(dot) def to_json(self): @@ -380,7 +377,7 @@ def to_json(self): """ json_str = json.dumps(self, cls=BuddyGraphEncoder) return json_str - + class BuddyGraphEncoder(json.JSONEncoder): """ @@ -392,36 +389,36 @@ class BuddyGraphEncoder(json.JSONEncoder): Returns: JSONEncoder: A JSON encoder instance for Buddy Graph objects. """ + def default(self, obj): if isinstance(obj, Graph): node_map_device = {} for subgraph_name, ops in obj.op_groups.items(): for op in ops: - node_map_device[op.name] = obj.group_map_device[subgraph_name] + node_map_device[op.name] = obj.group_map_device[ + subgraph_name + ] return { - 'graph_name' : obj._func_name, - 'nodes' : obj._body, - 'device' : obj.device, - 'params' : obj._fake_params, - 'inputs' : obj._inputs, - 'node_map_device' : node_map_device + "graph_name": obj._func_name, + "nodes": obj._body, + "device": obj.device, + "params": obj._fake_params, + "inputs": obj._inputs, + "node_map_device": node_map_device, } elif isinstance(obj, Op): return { - 'name' : obj._name, - 'children' : obj._children, - 'parents' : obj._parents, - 'arguments' : obj._arguments, - 'keyword_arguments' : obj._keyword_arguments, - 'tensor_meta' : obj._tensor_meta, - 'type' : obj._op_type, - 'class' : obj.__class__.__name__ + "name": obj._name, + "children": obj._children, + "parents": obj._parents, + "arguments": obj._arguments, + "keyword_arguments": obj._keyword_arguments, + "tensor_meta": obj._tensor_meta, + "type": obj._op_type, + "class": obj.__class__.__name__, } elif isinstance(obj, TensorMeta): - return { - 'shape' : obj.shape, - 'dtype' : obj.dtype - } + return {"shape": obj.shape, "dtype": obj.dtype} elif isinstance(obj, OpType): return obj._name_ elif isinstance(obj, TensorDType): @@ -431,6 +428,7 @@ def default(self, obj): else: return super().default(obj) + class GraphImporter: """ Imports an buddy graph and generates an MLIR module in high-level dialects. @@ -454,7 +452,7 @@ def __init__( ops_registry: dict, do_param_pack: bool = False, device: DeviceType = DeviceType.CPU, - verbose=False + verbose=False, ): """ Initializes the buddy Graph importer. 
@@ -572,40 +570,32 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] - # if self._device == DeviceType.GPU: - # returns = [ - # buffer.to_tensor(ret) - # for ret in returns - # ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) elif isinstance(node, GetItemOp): - self._symbol_table[ - (str(node.name), 0) - ] = self._symbol_table[ - (str(node.args[0]), node.args[1]) - ] + self._symbol_table[(str(node.name), 0)] = ( + self._symbol_table[ + (str(node.args[0]), node.args[1]) + ] + ) else: self._import_op(node) new_ops = [op for op in func_op.body.blocks[0].operations] if self._verbose: - print('='*20 + "Graph Node" + "="*20) + print("=" * 20 + "Graph Node" + "=" * 20) print("Node: " + node.name) print("Type: " + str(node._op_type)) print("Arguments: " + str(node.args)) print("Parents: " + str(node._parents)) print("Children: " + str(node._children)) - print('-'*20 + "MLIR OPS" + '-'*20) + print("-" * 20 + "MLIR OPS" + "-" * 20) for op in new_ops: if op not in old_ops: print(op) print("") - + return self._symbol_table.get(("output", 0)) - - # if self._device == DeviceType.GPU: - # self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() return self._module @@ -653,11 +643,11 @@ def generated_func(*args): elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) elif isinstance(node, GetItemOp): - self._symbol_table[ - (str(node.name), 0) - ] = self._symbol_table[ - (str(node.args[0]), node.args[1]) - ] + self._symbol_table[(str(node.name), 0)] = ( + self._symbol_table[ + (str(node.args[0]), node.args[1]) + ] + ) else: self._import_op(node) @@ -706,16 +696,6 @@ def _import_placeholder( else: placeholder_name = args_list[self._num_input_visited] - # TODO : Consider converting arg type from RankedTensorType to MemRefType - # if self._device == DeviceType.GPU: - # placeholder_name = buffer.to_memref( - # ir.MemRefType.get( - # list(node.tensor_meta.shape), - # self._str_to_mlir_dtype(node.tensor_meta.dtype) - # ), - # placeholder_name - # ) - self._symbol_table[(str(node.name), 0)] = placeholder_name self._num_input_visited += 1 @@ -727,16 +707,10 @@ def _import_op(self, node: Op): node (Op): The buddy node representing the operation. """ - op_name = node.__class__.__name__ - # if self._device == DeviceType.CPU: op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( self._ops_registry[op_name](node, self._symbol_table) ) - # else: - # op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - # self._ops_gpu_registry[op_name](node, self._symbol_table) - # ) if isinstance(op_ret, tuple | List): for i, operation in enumerate(op_ret): if isinstance(operation, ir.Operation) or isinstance( diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 58e7766cb1..013a9f6e0b 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -41,6 +41,7 @@ class GraphDriver: - _subgraphs_outputs (dict): A dictionary mapping subgraph names to their output op's result. """ + def __init__(self, graph: Graph) -> None: """ Initialize the GraphDriver object with a given computational graph. 
@@ -53,9 +54,9 @@ def __init__(self, graph: Graph) -> None: - None """ self._graph = graph - self._subgraph_dependencies = { - subgraph_name : set() - for subgraph_name in list(self._graph.op_groups.keys()) + self._subgraph_dependencies = { + subgraph_name: set() + for subgraph_name in list(self._graph.op_groups.keys()) } self._call_table = {} ( @@ -100,7 +101,7 @@ def build_subgraph_by_group(self): if isinstance(node, OutputOp): for arg in node.args: output_node.append(arg) - + # Identify outputs for each subgraph and build dependencies between subgraphs for subgraph_name in self._graph.op_groups.keys(): subgraphs_outputs[subgraph_name] = [] @@ -135,11 +136,11 @@ def build_subgraph_by_group(self): if inp in node._parents: placeholder_node.add_children(op.name) subgraph_body.append(placeholder_node) - + # Add operations to subgraph body for op in self._graph.op_groups[subgraph_name]: subgraph_body.append(op) - + # Construct output node output_node = OutputOp() output_node.name = "output" @@ -151,11 +152,11 @@ def build_subgraph_by_group(self): # Create subgraph and add it to the dictionary subgraph = Graph( subgraph_input, - [], - self._graph._ops_registry, + [], + self._graph._ops_registry, subgraph_name, - subgraph_device, - verbose=self._graph._verbose + subgraph_device, + verbose=self._graph._verbose, ) subgraph.body = subgraph_body for op in subgraph_body: @@ -176,12 +177,14 @@ def topological_sort_subgraph(self): """ # Calculate in degree of each subgraph - in_degree = { subgraph_name : 0 for subgraph_name in list(self._subgraphs.keys()) } + in_degree = { + subgraph_name: 0 for subgraph_name in list(self._subgraphs.keys()) + } for src, dests in self._subgraph_dependencies.items(): for dest in dests: in_degree[dest] += 1 - # Topological sorting + # Topological sorting queue = deque([node for node in in_degree if in_degree[node] == 0]) topo_order = [] @@ -194,7 +197,11 @@ def topological_sort_subgraph(self): queue.append(child) # TODO: If the custom subgraph partitioning is illegal, further partition the subgraph to make it valid. - return topo_order if len(topo_order) == len(list(self._subgraphs.keys())) else None + return ( + topo_order + if len(topo_order) == len(list(self._subgraphs.keys())) + else None + ) def construct_main_graph(self, do_param_pack=False): """ @@ -217,7 +224,7 @@ def construct_main_graph(self, do_param_pack=False): self._graph._fake_params, self._graph._ops_registry, self._graph._func_name, - self._graph._verbose + self._graph._verbose, ) # Adding FuncOp nodes for each subgraph @@ -235,18 +242,18 @@ def construct_main_graph(self, do_param_pack=False): self._graph.node_table[output].tensor_meta["dtype"] ) main_graph.add_node(func_node) - + # Adding placeholder operations from the original graph for op in self._graph.body: if isinstance(op, PlaceholderOp): main_graph.add_node(op) - + # Analysis topology order to sort subgraph call. 
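        # topological_sort_subgraph() returns the subgraph call order as a
        # list of subgraph names; it returns None when the requested partition
        # contains a cyclic dependency and therefore cannot be scheduled.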
        topo_order = self.topological_sort_subgraph()
-        if topo_order == None: 
-            print('Error : Graph Partitioning is illegal!')
+        if topo_order == None:
+            print("Error: Graph partitioning is illegal!")
            return None
-
+
        # Adding CallOp to invoke the single subgraph
        for i, subgraph_name in enumerate(topo_order):
            call_node = CallOp()
@@ -261,7 +268,7 @@ def construct_main_graph(self, do_param_pack=False):
                    if inp in value:
                        call_node.add_argument(
                            arg=self._call_table[key].name,
-                            arg_index=value.index(inp)
+                            arg_index=value.index(inp),
                        )
                        break
            for output in self._subgraphs_outputs[subgraph_name]:
@@ -283,7 +290,7 @@ def construct_main_graph(self, do_param_pack=False):
            getitem_node.name = "getitem{}".format(i)
            output_node.add_argument(getitem_node.name)
            main_graph.add_node(getitem_node)
-
+
        # Marking the final output of the main graph
        output_node.name = "output"
        main_graph.add_node(output_node)
diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py
index fde8809fd6..0ec7930c25 100644
--- a/frontend/Python/graph/operation.py
+++ b/frontend/Python/graph/operation.py
@@ -126,7 +126,7 @@ def args(self):
    @property
    def kwargs(self):
        return self._keyword_arguments
-
+
    @property
    def parents(self):
        return self._parents
diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py
index d91e0d06b2..427d266b95 100644
--- a/frontend/Python/graph/transform/__init__.py
+++ b/frontend/Python/graph/transform/__init__.py
@@ -18,5 +18,5 @@
 #
 # ===---------------------------------------------------------------------------
 
-from .fuse_ops import simply_fuse
+from .fuse_ops import cpu_fuse, gpu_fuse, custom_partition
 from .useless_op_eliminate import maxpool2d_simplify
diff --git a/frontend/Python/graph/transform/fuse_ops.py b/frontend/Python/graph/transform/fuse_ops.py
index feb697930a..e0ff806f52 100644
--- a/frontend/Python/graph/transform/fuse_ops.py
+++ b/frontend/Python/graph/transform/fuse_ops.py
@@ -26,11 +26,33 @@
 # OP_TYPE_FUSABLE = [OpType.BroadcastType, OpType.ElementwiseType, OpType.ReshapeType]
 # OP_TYPE_UNFUSABLE = [OpType.Unfusable, OpType.ConcatType]
 # OP_TYPE_FUSABLE_BY_SPECIFIC_PASS = []
-# ANCHOR_OP_TYPE = [] 
 
-def simply_fuse(graph: Graph):
+
+
+def cpu_fuse(graph: Graph):
+    """
+    Function to fuse all operations into one graph. Set the device type to CPU.
+
+    Args:
+    - graph (Graph): The input graph to be simplified.
+
+    Returns:
+    - None: Modifies the input graph in place.
+    """
+    new_op_group = []
+    device = DeviceType.CPU
+    for op in graph.body:
+        if isinstance(op, PlaceholderOp):
+            continue
+        new_op_group.append(op)
+    graph.op_groups = {}
+    graph.op_groups["subgraph0"] = new_op_group
+    graph.group_map_device = {"subgraph0": device}
+
+
+def gpu_fuse(graph: Graph):
     """
-    Function to fuse all operations into one graph.
+    Function to fuse all operations into one graph. Set the device type to GPU.
 
     Args:
     - graph (Graph): The input graph to be simplified.
@@ -47,3 +69,27 @@ def simply_fuse(graph: Graph):
     graph.op_groups = {}
     graph.op_groups["subgraph0"] = new_op_group
     graph.group_map_device = {"subgraph0": device}
+
+
+def custom_partition(graph: Graph):
+    """
+    Function for custom subgraph partitioning.
+
+    Args:
+    - graph (Graph): The input graph to be simplified.
+
+    Returns:
+    - None: Modifies the input graph in place.
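+
+    Note: this partition hard-codes graph node index 25 as the single op that
+    is assigned to the GPU subgraph (subgraph0); every other compute op goes
+    into the CPU subgraph (subgraph1). The index appears to be tied to the
+    LeNet example graph used in this patch series, not a general-purpose rule.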
+ """ + group = [] + for i, op in enumerate(graph._body): + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i == 25: + continue + group.append(op) + subgraph_name = "subgraph1" + graph.group_map_device[subgraph_name] = DeviceType.CPU + graph.op_groups[subgraph_name] = group + new_group = [graph._body[25]] + subgraph_name = "subgraph0" + graph.group_map_device[subgraph_name] = DeviceType.GPU + graph.op_groups[subgraph_name] = new_group diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py index 2522e17984..0d176be2df 100644 --- a/frontend/Python/graph/transform/useless_op_eliminate.py +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -74,4 +74,4 @@ def maxpool2d_simplify(graph: Graph): for j, op in enumerate(graph.body): if op == getitem_node: graph.body[j] = new_node - break \ No newline at end of file + break diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py index 3597810e4a..797fdfd6d2 100644 --- a/frontend/Python/ops/tosa.py +++ b/frontend/Python/ops/tosa.py @@ -1254,7 +1254,6 @@ def convolution2d_op(node: Conv2dOp, symbol_table): return op - def relu_op(node: ReluOp, symbol_table): """ Import the tensor relu operation. diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 012340d475..dad07bd68c 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -53,16 +53,3 @@ def mlir_element_attr_get(type_name, value): return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value) case TensorDType.Bool: return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) - - -def tensor_shape_size(shape): - """ - Calculate the product of all dimensions in the given shape list, - which represents the size of the tensor. - Args: - shape: A list containing the sizes of each dimension of the tensor. 
- """ - size = 1 - for dim in shape: - size *= dim - return size From 2488faf07bfdb13bc2c0131caff8447146a6aee5 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 29 Oct 2024 07:26:34 +0000 Subject: [PATCH 25/29] temp --- .gitignore | 2 - examples/BuddyLeNet/buddy-lenet-import.py | 15 -- frontend/Python/graph/graph.py | 3 - frontend/Python/ops/utils.py | 1 + tests/Conversion/convert-memcpy-to-gpu.mlir | 275 +++++--------------- 5 files changed, 66 insertions(+), 230 deletions(-) diff --git a/.gitignore b/.gitignore index 8fcf8e4b1e..1ffba60cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,5 @@ # Clangd cache .cache -# environment bash -env.sh # Clangd configurations .clangd \ No newline at end of file diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 4acd548038..aec4e5e561 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -89,18 +89,3 @@ float32_param.tofile(Path(current_path) / "arg0.data") -# # Convert the lenet graph to JSON string -# json_str = graph.to_json() -# with open(os.path.join(path_prefix, "lenet.json"), "w") as module_file: -# module_file.write(json_str) - -# # Convert the lenet graph Json string to a lenet graph -# graph0 = json_to_graph(json_str) -# graph0.lower_to_top_level_ir() -# with open(os.path.join(path_prefix, "lenet.mlir"), "w") as module_file: -# print(graph0._imported_module, file=module_file) - -# # Convert the lenet graph to DOT string -# dot_str = graph.to_dot() -# with open(os.path.join(path_prefix, "graph.dot"), "w") as module_file: -# module_file.write(dot_str) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 5ddbbe8328..3283beacee 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -343,8 +343,6 @@ def to_dot(self): """ dot = graphviz.Digraph(comment="Buddy Graph") for op in self._body: - # if isinstance(op, PlaceholderOp): - # continue for child in op._children: dot.edge(op._name, child) for op in self._body: @@ -352,7 +350,6 @@ def to_dot(self): dot.node( op._name, shape="ellipse", fillcolor="white", style="filled" ) - # continue elif isinstance(op, OutputOp): dot.node( op._name, shape="ellipse", fillcolor="white", style="filled" diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index dad07bd68c..337f5a6b49 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -53,3 +53,4 @@ def mlir_element_attr_get(type_name, value): return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value) case TensorDType.Bool: return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) + diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir index f616127930..573000a4b5 100644 --- a/tests/Conversion/convert-memcpy-to-gpu.mlir +++ b/tests/Conversion/convert-memcpy-to-gpu.mlir @@ -1,214 +1,69 @@ -//===- ConvertMemcpyToGPU.cpp ---------------------------------------------===// -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -// -// This file implements the pass that converts memcpy to gpu operations. -// -//===---------------------------------------------------------------------===// - -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/OperationSupport.h" -#include "mlir/IR/TypeRange.h" -#include "mlir/IR/ValueRange.h" -#include "mlir/IR/Visitors.h" -#include "mlir/Support/LLVM.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace mlir; -using namespace vector; - -//===----------------------------------------------------------------------===// -// ConvertMemcpyToGPUPass -//===----------------------------------------------------------------------===// - -namespace { - -class ConvertMemcpyToGPUPass - : public PassWrapper> { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertMemcpyToGPUPass) - StringRef getArgument() const final { return "convert-memcpy-to-gpu"; } - StringRef getDescription() const final { - return "Convert memref opertaions to gpu operations."; +// RUN: buddy-opt -convert-memcpy-to-gpu="process-args=1" %s | FileCheck %s + +#map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +module attributes {gpu.container_module} { + memref.global "private" constant @__constant_1x10x10xf32 : memref<1x10x10xf32> = dense<1.000000e+00> {alignment = 64 : i64} + func.func @matmul(%arg0: memref<1x10x10xf32>, %arg1: memref<1x10x10xf32>) -> memref<1x10x10xf32> { + // CHECK: %[[d_arg0:.*]] = gpu.alloc () : memref<1x10x10xf32> + // CHECK-NEXT: gpu.memcpy %[[d_arg0]], %arg0 : memref<1x10x10xf32>, memref<1x10x10xf32> + // CHECK: %[[d_arg1:.*]] = gpu.alloc () : memref<1x10x10xf32> + // CHECK-NEXT: gpu.memcpy %[[d_arg1:.*]], %arg1 : memref<1x10x10xf32>, memref<1x10x10xf32> + %c10 = arith.constant 10 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + // CHECK: %[[h_global_data:.*]] = memref.get_global @__constant_1x10x10xf32 : memref<1x10x10xf32> + // CHECK: %[[d_global_data:.*]] = gpu.alloc () : memref<1x10x10xf32> + // CHECK: gpu.memcpy %[[d_global_data]], %[[h_global_data]] : memref<1x10x10xf32>, memref<1x10x10xf32> + %0 = memref.get_global @__constant_1x10x10xf32 : memref<1x10x10xf32> + // CHECK: %[[d_alloc0:.*]] = gpu.alloc () : memref<1x10x10xf32> + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32> + // CHECK: gpu.launch_func + gpu.launch_func @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %cst : f32, %alloc : memref<1x10x10xf32>) + // CHECK: gpu.launch_func + // CHECK-SAME: %[[d_arg0]] + // CHECK-SAME: %[[d_arg1]] + // CHECK-SAME: %[[d_alloc0]] + gpu.launch_func @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<1x10x10xf32>, %arg1 : memref<1x10x10xf32>, %alloc : memref<1x10x10xf32>, %c10 : index) + // CHECK: %[[d_alloc1:.*]] = gpu.alloc () : memref<1x10x10xf32> + %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32> + // CHECK: gpu.launch_func + gpu.launch_func @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : 
index, %c0 : index, %cst : f32, %alloc_0 : memref<1x10x10xf32>) + // CHECK: gpu.launch_func + // CHECK-SAME: %[[d_global_data]] + // CHECK-SAME: %[[d_alloc0]] + // CHECK-SAME: %[[d_alloc1]] + gpu.launch_func @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %0 : memref<1x10x10xf32>, %alloc : memref<1x10x10xf32>, %alloc_0 : memref<1x10x10xf32>, %c10 : index) + // CHECK: %[[d_result:.*]] = gpu.alloc () : memref<1x10x10xf32> + %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32> + // CHECK: gpu.launch_func + gpu.launch_func @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %cst : f32, %alloc_1 : memref<1x10x10xf32>) + // CHECK: gpu.launch_func + // CHECK-SAME: %[[d_alloc0]] + // CHECK-SAME: %[[d_alloc1]] + // CHECK-SAME: %[[d_result]] + gpu.launch_func @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %alloc : memref<1x10x10xf32>, %alloc_0 : memref<1x10x10xf32>, %alloc_1 : memref<1x10x10xf32>, %c10 : index) + // CHECK: gpu.dealloc %[[d_alloc1]] : memref<1x10x10xf32> + memref.dealloc %alloc_0 : memref<1x10x10xf32> + // CHECK: gpu.dealloc %[[d_alloc0]] : memref<1x10x10xf32> + memref.dealloc %alloc : memref<1x10x10xf32> + + // CHECK: %[[h_alloc:.*]] = memref.alloc() : memref<1x10x10xf32> + // CHECK-NEXT: gpu.memcpy %[[h_alloc]], %[[d_result]] : memref<1x10x10xf32>, memref<1x10x10xf32> + + // CHECK: gpu.dealloc %[[d_arg0]] : memref<1x10x10xf32> + // CHECK: gpu.dealloc %[[d_arg1]] : memref<1x10x10xf32> + // CHECK: gpu.dealloc %[[d_global_data]] : memref<1x10x10xf32> + + // CHECK: return %[[h_alloc]] : memref<1x10x10xf32> + return %alloc_1 : memref<1x10x10xf32> } - ConvertMemcpyToGPUPass() = default; - ConvertMemcpyToGPUPass(const ConvertMemcpyToGPUPass &) {} - - Option processArgs{ - *this, "process-args", - llvm::cl::desc("Whether the pass processes the input args."), - llvm::cl::init(true)}; - - void runOnOperation() override; - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } -}; - -void ConvertMemcpyToGPUPass::runOnOperation() { - auto funcOp = getOperation(); - - if (funcOp.isDeclaration() || funcOp.isExternal()) - return; - - // Make sure the gpu function is already outlined. 
- funcOp->walk([&](Operation *nestedOp) { - if (auto gpuLaunchOp = dyn_cast(nestedOp)) { - nestedOp->emitOpError("The gpu function should be outlined."); + gpu.module @kernel { + gpu.func @fill(%arg0: index, %arg1: index, %arg2: f32, %arg3: memref<1x10x10xf32>) kernel attributes {gpu.known_block_size = array} { + gpu.return } - return WalkResult::advance(); - }); - - std::vector unDeallocatedValue; - OpBuilder builder(funcOp->getContext()); - - // Copy all function arguments to gpu, needs deallocation - if (processArgs) { - builder.setInsertionPointToStart(&(funcOp.getBody().front())); - unsigned numArgs = funcOp.getNumArguments(); - for (unsigned i = 0; i < numArgs; ++i) { - BlockArgument arg = funcOp.getArgument(i); - // Create a gpu.alloc op, then copy memory to it - // TODO: Move this out of operation, make the copy process async - auto memrefType = dyn_cast(arg.getType()); - auto gpuAllocOp = builder.create( - builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); - auto gpuMemcpyOp = builder.create( - gpuAllocOp.getLoc(), TypeRange(), ValueRange(), - gpuAllocOp.getResult(0), arg); - arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp); + gpu.func @matmul(%arg0: index, %arg1: index, %arg2: memref<1x10x10xf32>, %arg3: memref<1x10x10xf32>, %arg4: memref<1x10x10xf32>, %arg5: index) kernel attributes {gpu.known_block_size = array} { + gpu.return } } - - funcOp->walk([&](Operation *nestedOp) { - // Replace all allocations with GPU.alloc - if (auto allocOp = dyn_cast(nestedOp)) { - // Rewrite this allocOp to gpu.alloc, change for all users - builder.setInsertionPointAfter(allocOp); - auto result = allocOp->getResult(0); - auto memrefType = dyn_cast(result.getType()); - auto memorySpace = memrefType.getMemorySpace(); - - // Filter operations. 
- if (memorySpace) { - if (auto intMemorySpace = llvm::dyn_cast(memorySpace)) { - if (intMemorySpace.getInt() != 0) { - return WalkResult::advance(); - } - } else if (auto gpuMemorySpace = - llvm::dyn_cast(memorySpace)) { - if (gpuMemorySpace.getValue() != gpu::AddressSpace::Global) { - return WalkResult::advance(); - } - } else - return WalkResult::advance(); - } - - auto gpuAllocOp = builder.create( - allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - - for (auto user : llvm::make_early_inc_range(result.getUsers())) { - if (auto deallocOp = dyn_cast(user)) { - builder.setInsertionPointAfter(deallocOp); - builder.create(deallocOp->getLoc(), TypeRange(), - ValueRange(), gpuAllocOp.getResult(0)); - deallocOp->erase(); - } else { - for (auto &opOperand : user->getOpOperands()) { - if (opOperand.is(result)) { - opOperand.set(gpuAllocOp.getResult(0)); - } - } - } - } - allocOp->erase(); - } - // Replace all memory.copy operations with gpu.memcpy - else if (auto copyOp = dyn_cast(nestedOp)) { - auto src = copyOp.getOperand(0); - auto dst = copyOp.getOperand(1); - // Notice: GPU.memcpy has a different src dst order - builder.setInsertionPointAfter(copyOp); - auto gpuMemcpyOp = builder.create( - copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); - src.replaceAllUsesWith(gpuMemcpyOp->getResult(1)); - dst.replaceAllUsesWith(gpuMemcpyOp->getResult(0)); - copyOp->erase(); - } - // Allocate space on GPU and copy global memrefs to GPU, needs deallocation - else if (auto getGlobalOp = dyn_cast(nestedOp)) { - builder.setInsertionPointAfter(getGlobalOp); - auto result = getGlobalOp->getResult(0); - auto memrefType = dyn_cast(result.getType()); - auto gpuAllocOp = builder.create( - getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); - - auto src = result; - auto dst = gpuAllocOp->getResult(0); - auto gpuMemcpyOp = builder.create( - gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); - src.replaceAllUsesExcept(dst, gpuMemcpyOp); - } - // Copy data back to CPU, deallocate GPU, then return - else if (auto returnOp = dyn_cast(nestedOp)) { - builder.setInsertionPoint(returnOp); - for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { - auto val = returnOp->getOperand(i); - if (auto memrefType = dyn_cast(val.getType())) { - auto allocOp = - builder.create(returnOp->getLoc(), memrefType); - builder.create(allocOp.getLoc(), TypeRange(), - ValueRange(), allocOp->getResult(0), - val); - // FIXME: may be leak memory - // auto gpuDeallocOp = builder.create( - // gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); - returnOp->setOperand(i, allocOp->getResult(0)); - } - } - for (auto value : unDeallocatedValue) { - builder.create(returnOp->getLoc(), TypeRange(), - ValueRange(), value); - } - } - return WalkResult::advance(); - }); -} -} // end anonymous namespace. 
- -namespace mlir { -namespace buddy { -void registerConvertMemcpyToGPUPass() { - PassRegistration(); -} -} // namespace buddy -} // namespace mlir \ No newline at end of file +} \ No newline at end of file From 321927ca88028a77ab92e88a6a8169a3445d89bb Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 02:45:37 +0000 Subject: [PATCH 26/29] Pass the test --- frontend/Python/frontend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index 9d8c80f014..210815fb1e 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -45,6 +45,7 @@ from .graph import Graph, TensorDType, TensorMeta from .graph.operation import * from .graph.transform import maxpool2d_simplify +from .graph.type import * class DynamoCompiler: @@ -284,6 +285,7 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): fake_params, self._ops_registry, self._func_name, + DeviceType.CPU, self._verbose ) for gm_node in _gm.graph.nodes: From 5d5a844c7b534429d07dab4c70e2b30dce243179 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 03:15:51 +0000 Subject: [PATCH 27/29] correct --- examples/BuddyLeNet/CMakeLists.txt | 23 ----- frontend/Python/graph/json_decoder.py | 94 ++++++++++++------- .../Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp | 2 +- tests/Conversion/convert-memcpy-to-gpu.mlir | 2 +- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 5935ad50c5..1902384f92 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -52,29 +52,6 @@ add_custom_command( set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") -# add_custom_command( -# OUTPUT subgraph0.o -# COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} -# -buffer-deallocation -# -convert-linalg-to-parallel-loops -# -canonicalize -# -gpu-map-parallel-loops -# -convert-parallel-loops-to-gpu -# -gpu-kernel-outlining -# -canonicalize -# -cse | -# ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | -# ${LLVM_TOOLS_BINARY_DIR}/llvm-as | -# ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# COMMENT "Building subgraph0.o" -# VERBATIM) - add_custom_command( OUTPUT subgraph1.o COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir diff --git a/frontend/Python/graph/json_decoder.py b/frontend/Python/graph/json_decoder.py index cfa825b0aa..f3a11440ac 100644 --- a/frontend/Python/graph/json_decoder.py +++ b/frontend/Python/graph/json_decoder.py @@ -1,3 +1,22 @@ +# ===- json_decoder.py --------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This converts the JSON string representing Buddy Graph into a Graph object. +# +# ===--------------------------------------------------------------------------- import json from pathlib import Path @@ -11,6 +30,7 @@ from ..ops.math import ops_registry as math_ops_registry from ..ops.func import ops_registry as func_ops_registry + def json_to_graph(json_str): """ Converts a buddy graph JSON string to a Graph object. @@ -21,6 +41,7 @@ def json_to_graph(json_str): Returns: Graph: The Graph object created from the JSON data. """ + def json_to_tensormeta(json_data): """ Convert JSON data to a TensorMeta object. @@ -31,68 +52,77 @@ def json_to_tensormeta(json_data): Returns: TensorMeta: The TensorMeta object created from the JSON data. """ - if 'shape' in json_data: - shape = json_data['shape'] + if "shape" in json_data: + shape = json_data["shape"] dtype = next( - (member for member in TensorDType.__members__.values() - if member.value.upper() == json_data['dtype'].upper()), None + ( + member + for member in TensorDType.__members__.values() + if member.value.upper() == json_data["dtype"].upper() + ), + None, ) return TensorMeta(shape, dtype) return {} - + json_data = json.loads(json_str) _graph = json_data - graph_name = _graph['graph_name'] + graph_name = _graph["graph_name"] inputs = [] params = [] - for _input in _graph['inputs']: + for _input in _graph["inputs"]: inputs.append(json_to_tensormeta(_input)) - for _param in _graph['params']: + for _param in _graph["params"]: params.append(json_to_tensormeta(_param)) ops_registry = {} ops_registry.update(func_ops_registry) ops_registry.update(linalg_ops_registry) ops_registry.update(tosa_ops_registry) ops_registry.update(math_ops_registry) - graph = Graph( - inputs, - params, - ops_registry, - graph_name - ) - graph.device = _graph['device'] - for _node in _graph['nodes']: - op_class = _node['class'] + graph = Graph(inputs, params, ops_registry, graph_name) + graph.device = _graph["device"] + for _node in _graph["nodes"]: + op_class = _node["class"] op = globals()[op_class]() - op._name = _node['name'] - op._children = _node['children'] - op._parents = _node['parents'] - op._arguments = _node['arguments'] - op._keyword_arguments = _node['keyword_arguments'] + op._name = _node["name"] + op._children = _node["children"] + op._parents = _node["parents"] + op._arguments = _node["arguments"] + op._keyword_arguments = _node["keyword_arguments"] op._type = next( - (member for member in OpType.__members__.values() if member.value == _node['type']), None + ( + member + for member in OpType.__members__.values() + if member.value == _node["type"] + ), + None, ) # TODO : node attr tensor_meta should be Class TensorMeta - if ('shape' not in _node['tensor_meta']): - op._tensor_meta = _node['tensor_meta'] + if "shape" not in _node["tensor_meta"]: + op._tensor_meta = _node["tensor_meta"] else: op._tensor_meta = { - 'shape' : _node['tensor_meta']['shape'], - 'dtype' : next( - (member for member in 
TensorDType.__members__.values() - if member.value.upper() == _node['tensor_meta']['dtype'].upper()), None - ) + "shape": _node["tensor_meta"]["shape"], + "dtype": next( + ( + member + for member in TensorDType.__members__.values() + if member.value.upper() + == _node["tensor_meta"]["dtype"].upper() + ), + None, + ), } graph.add_node(op) - for i, device in enumerate(list(set(_graph['node_map_device'].values()))): + for i, device in enumerate(list(set(_graph["node_map_device"].values()))): subgraph_name = "subgraph{}".format(i) graph.op_groups[subgraph_name] = [] graph.group_map_device[subgraph_name] = DeviceType(device) - for node, op_device in _graph['node_map_device'].items(): + for node, op_device in _graph["node_map_device"].items(): op = graph.node_table[node] for subgraph_name, group_device in graph.group_map_device.items(): if op_device == group_device.value: diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp index f616127930..e44f21cb6e 100644 --- a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp +++ b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp @@ -211,4 +211,4 @@ void registerConvertMemcpyToGPUPass() { PassRegistration(); } } // namespace buddy -} // namespace mlir \ No newline at end of file +} // namespace mlir diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir index 573000a4b5..65e9301e4a 100644 --- a/tests/Conversion/convert-memcpy-to-gpu.mlir +++ b/tests/Conversion/convert-memcpy-to-gpu.mlir @@ -66,4 +66,4 @@ module attributes {gpu.container_module} { gpu.return } } -} \ No newline at end of file +} From 69c4262f6dd1fa7ccca5d765e71d7edb1892f35c Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 03:43:37 +0000 Subject: [PATCH 28/29] temp --- examples/BuddyLeNet/buddy-lenet-import.py | 2 +- frontend/Python/graph/transform/__init__.py | 2 +- frontend/Python/graph/transform/fuse_ops.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index aec4e5e561..d6bcd30b8a 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -27,7 +27,7 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import cpu_fuse, gpu_fuse, custom_partition +from buddy.compiler.graph.transform import simply_fuse, gpu_fuse, custom_partition from buddy.compiler.graph.type import DeviceType from buddy.compiler.ops import tosa, gpu from buddy.compiler.graph.json_decoder import json_to_graph diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py index 427d266b95..95428b3367 100644 --- a/frontend/Python/graph/transform/__init__.py +++ b/frontend/Python/graph/transform/__init__.py @@ -18,5 +18,5 @@ # # ===--------------------------------------------------------------------------- -from .fuse_ops import cpu_fuse, gpu_fuse, custom_partition +from .fuse_ops import simply_fuse, gpu_fuse, custom_partition from .useless_op_eliminate import maxpool2d_simplify diff --git a/frontend/Python/graph/transform/fuse_ops.py b/frontend/Python/graph/transform/fuse_ops.py index e0ff806f52..7bfd2e8f98 100644 --- a/frontend/Python/graph/transform/fuse_ops.py +++ b/frontend/Python/graph/transform/fuse_ops.py @@ -29,7 +29,7 @@ # ANCHOR_OP_TYPE = [] -def cpu_fuse(graph: Graph): +def 
simply_fuse(graph: Graph): """ Function to fuse all operations into one graph. Set the device type to CPU. From bf7ca39b8b99737cae7b9d47d488011c5363d3b8 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 05:34:56 +0000 Subject: [PATCH 29/29] final --- examples/BuddyLeNet/buddy-lenet-import.py | 17 ++++++----------- frontend/Python/graph/graph.py | 5 +---- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index d6bcd30b8a..2ef14649e6 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -27,7 +27,11 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse, gpu_fuse, custom_partition +from buddy.compiler.graph.transform import ( + simply_fuse, + gpu_fuse, + custom_partition, +) from buddy.compiler.graph.type import DeviceType from buddy.compiler.ops import tosa, gpu from buddy.compiler.graph.json_decoder import json_to_graph @@ -61,15 +65,7 @@ pattern_list = [custom_partition] graph.fuse_ops(pattern_list) path_prefix = os.path.dirname(os.path.abspath(__file__)) - -# Convert the lenet graph to JSON string -json_str = graph.to_json() -with open(os.path.join(path_prefix, "lenet.json"), "w") as module_file: - module_file.write(json_str) - -# Convert the lenet graph Json string to a lenet graph -graph0 = json_to_graph(json_str) -driver = GraphDriver(graph0) +driver = GraphDriver(graph) driver.subgraphs[0].lower_to_top_level_ir() with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) @@ -88,4 +84,3 @@ ) float32_param.tofile(Path(current_path) / "arg0.data") - diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 3283beacee..ddf50f697c 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -198,13 +198,10 @@ def fuse_ops(self, pattern_list: List[FunctionType]): # TODO: discuss two fuse strategy # 1. fuse ops adapt for DSA(hardware dependent) # 2. common fuse strategy(hardware independent) - - # Initialize operation groups - self.init_op_group() - # Apply fusion patterns for pattern_func in pattern_list: pattern_func(self) + # Initialize operation groups def perform(self, func_list: List[FunctionType]): """
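
A minimal usage sketch of the JSON round trip these patches introduce, assuming a graph was already fused and serialized with graph.to_json() into "lenet.json" (the file name and location are illustrative); it only uses calls that appear in the patches above (json_to_graph, GraphDriver, lower_to_top_level_ir) and is a sketch, not part of the patch series itself:

import os

from buddy.compiler.graph import GraphDriver
from buddy.compiler.graph.json_decoder import json_to_graph

# Illustrative assumption: lenet.json was produced by an earlier run of
# buddy-lenet-import.py via graph.to_json().
path_prefix = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(path_prefix, "lenet.json"), "r") as f:
    json_str = f.read()

# Rebuild the Graph object, including its op groups and device mapping.
graph0 = json_to_graph(json_str)

# Drive the decoded graph and lower subgraph 0 to top-level MLIR.
driver = GraphDriver(graph0)
driver.subgraphs[0].lower_to_top_level_ir()
print(driver.subgraphs[0]._imported_module)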