From cbbba151e5e71e0a0cec64e6ec84fe5c7d2de893 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 19 Jun 2024 15:43:16 +0800 Subject: [PATCH 01/29] temp --- env.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 env.sh diff --git a/env.sh b/env.sh new file mode 100644 index 0000000000..fccbb02918 --- /dev/null +++ b/env.sh @@ -0,0 +1,7 @@ +cd build/ +export BUDDY_MLIR_BUILD_DIR=$PWD +export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build +export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} + +export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ +cd ../ \ No newline at end of file From 0858f29501b321372059431672e3026effd5dc02 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 19 Jun 2024 20:21:22 +0800 Subject: [PATCH 02/29] fix/maxpool2d_simplify --- frontend/Python/graph/operation.py | 9 +++++++++ .../graph/transform/useless_op_eliminate.py | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py index 14bfbf2752..6b543224e5 100644 --- a/frontend/Python/graph/operation.py +++ b/frontend/Python/graph/operation.py @@ -124,10 +124,19 @@ def args(self): @property def kwargs(self): return self._keyword_arguments + + @property + def parents(self): + return self._parents + + @property + def children(self): + return self._children @property def name(self): return self._name + @name.setter def name(self, new_name): self._name = new_name diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py index a99dbe02c6..2522e17984 100644 --- a/frontend/Python/graph/transform/useless_op_eliminate.py +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -42,13 +42,24 @@ def maxpool2d_simplify(graph: Graph): and getitem_node.args[1] == 0 ): new_node = MaxPool2dOp() - new_node.name = getitem_node.name + new_node.name = node.name.replace("_with_indices", "") for arg in node.args: new_node.add_argument(arg) for parent in node._parents: new_node.add_parent(parent) + parent_node = graph.node_table[parent] + for cindex, child in enumerate(parent_node.children): + if child == node.name: + parent_node.children[cindex] = new_node.name for child in getitem_node._children: new_node.add_children(child) + child_node = graph.node_table[child] + for pindex, parent in enumerate(child_node.parents): + if parent == getitem_node.name: + child_node.parents[pindex] = new_node.name + for aindex, arg in enumerate(child_node.args): + if arg == getitem_node.name: + child_node.args[aindex] = new_node.name new_node.tensor_meta["shape"] = getitem_node.tensor_meta[ "shape" ] @@ -63,4 +74,4 @@ def maxpool2d_simplify(graph: Graph): for j, op in enumerate(graph.body): if op == getitem_node: graph.body[j] = new_node - break + break \ No newline at end of file From f2fd5720a8f636636346eb37e85b9ec04b1915cb Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 19 Jun 2024 20:22:42 +0800 Subject: [PATCH 03/29] fix/maxpool2d_simplify --- env.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index fccbb02918..0000000000 --- a/env.sh +++ /dev/null @@ -1,7 +0,0 @@ -cd build/ -export BUDDY_MLIR_BUILD_DIR=$PWD -export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -export 
PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} - -export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ -cd ../ \ No newline at end of file From b2c4c29128a342196551f5889cc9388d3eb8f010 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Thu, 20 Jun 2024 16:04:14 +0800 Subject: [PATCH 04/29] add json_encoder and json_decoder --- examples/BuddyLeNet/buddy-lenet-import.py | 16 ++++ examples/BuddyLeNet/graph.dot | 56 +++++++++++ examples/BuddyLeNet/lenet.json | 1 + frontend/Python/graph/graph.py | 83 +++++++++++++++++ frontend/Python/graph/json_decoder.py | 93 +++++++++++++++++++ frontend/Python/graph/operation.py | 8 ++ .../graph/transform/useless_op_eliminate.py | 15 ++- 7 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 examples/BuddyLeNet/graph.dot create mode 100644 examples/BuddyLeNet/lenet.json create mode 100644 frontend/Python/graph/json_decoder.py diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 95e76de253..76fcb32cf0 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -29,6 +29,7 @@ from buddy.compiler.graph import GraphDriver from buddy.compiler.graph.transform import simply_fuse from buddy.compiler.ops import tosa +from buddy.compiler.graph.json_decoder import json_to_graph from model import LeNet # Retrieve the LeNet model path from environment variables. @@ -74,3 +75,18 @@ ) float32_param.tofile(Path(current_path) / "arg0.data") + +# Convert the lenet graph to JSON string +json_str = graph.to_json() +with open(os.path.join(path_prefix, "lenet.json"), "w") as module_file: + module_file.write(json_str) + +# Convert the lenet graph Json string to a lenet graph +graph0 = json_to_graph(json_str) +graph0.lower_to_top_level_ir() +print(graph0._imported_module) + +# Convert the lenet graph to DOT string +dot_str = graph.to_dot() +with open(os.path.join(path_prefix, "graph.dot"), "w") as module_file: + module_file.write(dot_str) \ No newline at end of file diff --git a/examples/BuddyLeNet/graph.dot b/examples/BuddyLeNet/graph.dot new file mode 100644 index 0000000000..04313d9e35 --- /dev/null +++ b/examples/BuddyLeNet/graph.dot @@ -0,0 +1,56 @@ +// Buddy Graph +digraph { + arg0_1 -> convolution + arg1_1 -> convolution + arg2_1 -> convolution_1 + arg3_1 -> convolution_1 + arg4_1 -> permute + arg5_1 -> addmm + arg6_1 -> permute_1 + arg7_1 -> addmm_1 + arg8_1 -> permute_2 + arg9_1 -> addmm_2 + arg10_1 -> convolution + convolution -> relu + relu -> max_pool2d + max_pool2d -> convolution_1 + convolution_1 -> relu_1 + relu_1 -> max_pool2d_1 + max_pool2d_1 -> view + view -> addmm + permute -> addmm + addmm -> relu_2 + relu_2 -> addmm_1 + permute_1 -> addmm_1 + addmm_1 -> relu_3 + relu_3 -> addmm_2 + permute_2 -> addmm_2 + addmm_2 -> output + arg0_1 [fillcolor=white shape=ellipse style=filled] + arg1_1 [fillcolor=white shape=ellipse style=filled] + arg2_1 [fillcolor=white shape=ellipse style=filled] + arg3_1 [fillcolor=white shape=ellipse style=filled] + arg4_1 [fillcolor=white shape=ellipse style=filled] + arg5_1 [fillcolor=white shape=ellipse style=filled] + arg6_1 [fillcolor=white shape=ellipse style=filled] + arg7_1 [fillcolor=white shape=ellipse style=filled] + arg8_1 [fillcolor=white shape=ellipse style=filled] + arg9_1 [fillcolor=white shape=ellipse style=filled] + arg10_1 [fillcolor=white shape=ellipse style=filled] + convolution [fillcolor=deepskyblue 
shape=box style=filled] + relu [fillcolor=deepskyblue shape=box style=filled] + max_pool2d [fillcolor=red shape=box style=filled] + convolution_1 [fillcolor=deepskyblue shape=box style=filled] + relu_1 [fillcolor=deepskyblue shape=box style=filled] + max_pool2d_1 [fillcolor=red shape=box style=filled] + view [fillcolor=deepskyblue shape=box style=filled] + permute [fillcolor=deepskyblue shape=box style=filled] + addmm [fillcolor=deepskyblue shape=box style=filled] + relu_2 [fillcolor=deepskyblue shape=box style=filled] + permute_1 [fillcolor=deepskyblue shape=box style=filled] + addmm_1 [fillcolor=deepskyblue shape=box style=filled] + relu_3 [fillcolor=deepskyblue shape=box style=filled] + permute_2 [fillcolor=deepskyblue shape=box style=filled] + addmm_2 [fillcolor=deepskyblue shape=box style=filled] + output [fillcolor=white shape=ellipse style=filled] +} diff --git a/examples/BuddyLeNet/lenet.json b/examples/BuddyLeNet/lenet.json new file mode 100644 index 0000000000..11171f91ac --- /dev/null +++ b/examples/BuddyLeNet/lenet.json @@ -0,0 +1 @@ +{"graph_name": "forward", "nodes": [{"name": "arg0_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6, 1, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg1_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg2_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16, 6, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg3_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg4_1", "children": ["permute"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 256], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg5_1", "children": ["addmm"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg6_1", "children": ["permute_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg7_1", "children": ["addmm_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg8_1", "children": ["permute_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10, 84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg9_1", "children": ["addmm_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg10_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 1, 28, 28], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "convolution", "children": ["relu"], "parents": ["arg10_1", "arg0_1", "arg1_1"], "arguments": 
["arg10_1", "arg0_1", "arg1_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu", "children": ["max_pool2d"], "parents": ["convolution"], "arguments": ["convolution"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d", "children": ["convolution_1"], "parents": ["relu"], "arguments": ["relu", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 12, 12], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "convolution_1", "children": ["relu_1"], "parents": ["max_pool2d", "arg2_1", "arg3_1"], "arguments": ["max_pool2d", "arg2_1", "arg3_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu_1", "children": ["max_pool2d_1"], "parents": ["convolution_1"], "arguments": ["convolution_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d_1", "children": ["view"], "parents": ["relu_1"], "arguments": ["relu_1", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 4, 4], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "view", "children": ["addmm"], "parents": ["max_pool2d_1"], "arguments": ["max_pool2d_1", [-1, 256]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 256], "dtype": "Float32"}, "type": "ReshapeType", "class": "ViewOp"}, {"name": "permute", "children": ["addmm"], "parents": ["arg4_1"], "arguments": ["arg4_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [256, 120], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm", "children": ["relu_2"], "parents": ["arg5_1", "view", "permute"], "arguments": ["arg5_1", "view", "permute"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_2", "children": ["addmm_1"], "parents": ["addmm"], "arguments": ["addmm"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_1", "children": ["addmm_1"], "parents": ["arg6_1"], "arguments": ["arg6_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 84], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_1", "children": ["relu_3"], "parents": ["arg7_1", "relu_2", "permute_1"], "arguments": ["arg7_1", "relu_2", "permute_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_3", "children": ["addmm_2"], "parents": ["addmm_1"], "arguments": ["addmm_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_2", "children": ["addmm_2"], "parents": ["arg8_1"], "arguments": ["arg8_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 10], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_2", "children": ["output"], "parents": ["arg9_1", "relu_3", "permute_2"], "arguments": ["arg9_1", "relu_3", "permute_2"], "keyword_arguments": {}, "tensor_meta": 
{"shape": [1, 10], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "output", "children": [], "parents": [], "arguments": ["addmm_2"], "keyword_arguments": {}, "tensor_meta": {}, "type": "GetItemType", "class": "OutputOp"}], "device": "cpu", "params": [{"shape": [6, 1, 5, 5], "dtype": "Float32"}, {"shape": [6], "dtype": "Float32"}, {"shape": [16, 6, 5, 5], "dtype": "Float32"}, {"shape": [16], "dtype": "Float32"}, {"shape": [120, 256], "dtype": "Float32"}, {"shape": [120], "dtype": "Float32"}, {"shape": [84, 120], "dtype": "Float32"}, {"shape": [84], "dtype": "Float32"}, {"shape": [10, 84], "dtype": "Float32"}, {"shape": [10], "dtype": "Float32"}], "inputs": [{"shape": [1, 1, 28, 28], "dtype": "Float32"}], "subgraphs": {"subgraph0": [{"name": "convolution", "children": ["relu"], "parents": ["arg10_1", "arg0_1", "arg1_1"], "arguments": ["arg10_1", "arg0_1", "arg1_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu", "children": ["max_pool2d"], "parents": ["convolution"], "arguments": ["convolution"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d", "children": ["convolution_1"], "parents": ["relu"], "arguments": ["relu", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 12, 12], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "convolution_1", "children": ["relu_1"], "parents": ["max_pool2d", "arg2_1", "arg3_1"], "arguments": ["max_pool2d", "arg2_1", "arg3_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu_1", "children": ["max_pool2d_1"], "parents": ["convolution_1"], "arguments": ["convolution_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d_1", "children": ["view"], "parents": ["relu_1"], "arguments": ["relu_1", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 4, 4], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "view", "children": ["addmm"], "parents": ["max_pool2d_1"], "arguments": ["max_pool2d_1", [-1, 256]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 256], "dtype": "Float32"}, "type": "ReshapeType", "class": "ViewOp"}, {"name": "permute", "children": ["addmm"], "parents": ["arg4_1"], "arguments": ["arg4_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [256, 120], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm", "children": ["relu_2"], "parents": ["arg5_1", "view", "permute"], "arguments": ["arg5_1", "view", "permute"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_2", "children": ["addmm_1"], "parents": ["addmm"], "arguments": ["addmm"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_1", "children": ["addmm_1"], "parents": ["arg6_1"], "arguments": ["arg6_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 84], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_1", 
"children": ["relu_3"], "parents": ["arg7_1", "relu_2", "permute_1"], "arguments": ["arg7_1", "relu_2", "permute_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_3", "children": ["addmm_2"], "parents": ["addmm_1"], "arguments": ["addmm_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_2", "children": ["addmm_2"], "parents": ["arg8_1"], "arguments": ["arg8_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 10], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_2", "children": ["output"], "parents": ["arg9_1", "relu_3", "permute_2"], "arguments": ["arg9_1", "relu_3", "permute_2"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 10], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "output", "children": [], "parents": [], "arguments": ["addmm_2"], "keyword_arguments": {}, "tensor_meta": {}, "type": "GetItemType", "class": "OutputOp"}]}, "subgraph_map_device": {"subgraph0": "UNKNOW"}} \ No newline at end of file diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index eb78c0ff33..898f967b63 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -23,6 +23,8 @@ import ctypes import functools import numpy as np +import graphviz +import json import mlir.ir as ir import mlir.dialects.func as func @@ -324,6 +326,87 @@ def compile(self): self.lower_to_top_level_ir() self.lower_to_llvm_ir() + def to_dot(self): + """ + Converts a buddy graph to a DOT string for visualization. + + Returns: + str: A DOT string representing the buddy graph for visualization. + """ + dot = graphviz.Digraph(comment='Buddy Graph') + for op in self._body: + # if isinstance(op, PlaceholderOp): + # continue + for child in op._children: + dot.edge(op._name, child) + for op in self._body: + if isinstance(op, PlaceholderOp): + dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + # continue + elif isinstance(op, OutputOp): + dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + elif isinstance(op, MaxPool2dOp): + dot.node(op._name, shape="box", fillcolor="red", style="filled") + else: + dot.node(op._name, shape="box", fillcolor="deepskyblue", style="filled") + return str(dot) + + def to_json(self): + """ + Converts a buddy graph to a JSON string. + + Returns: + str: A JSON string representing the buddy graph. + """ + json_str = json.dumps(self, cls=BuddyGraphEncoder) + return json_str + + +class BuddyGraphEncoder(json.JSONEncoder): + """ + Custom JSON encoder for converting Buddy Graph objects to JSON strings. + + This encoder handles encoding of Graph, Op, TensorMeta, OpType, TensorDType, + and DeviceType objects to their JSON representation. + + Returns: + JSONEncoder: A JSON encoder instance for Buddy Graph objects. 
+ """ + def default(self, obj): + if isinstance(obj, Graph): + return { + 'graph_name' : obj._func_name, + 'nodes' : obj._body, + 'device' : obj.device, + 'params' : obj._fake_params, + 'inputs' : obj._inputs, + 'subgraphs' : obj.op_groups, + 'subgraph_map_device' : obj.group_map_device + } + elif isinstance(obj, Op): + return { + 'name' : obj._name, + 'children' : obj._children, + 'parents' : obj._parents, + 'arguments' : obj._arguments, + 'keyword_arguments' : obj._keyword_arguments, + 'tensor_meta' : obj._tensor_meta, + 'type' : obj._op_type, + 'class' : obj.__class__.__name__ + } + elif isinstance(obj, TensorMeta): + return { + 'shape' : obj.shape, + 'dtype' : obj.dtype + } + elif isinstance(obj, OpType): + return obj._name_ + elif isinstance(obj, TensorDType): + return obj._name_ + elif isinstance(obj, DeviceType): + return obj._name_ + else: + return super().default(obj) class GraphImporter: """ diff --git a/frontend/Python/graph/json_decoder.py b/frontend/Python/graph/json_decoder.py new file mode 100644 index 0000000000..70e5112c32 --- /dev/null +++ b/frontend/Python/graph/json_decoder.py @@ -0,0 +1,93 @@ +import json +from pathlib import Path + +from .graph import Graph, TensorDType, TensorMeta +from .graph_driver import GraphDriver +from .operation import * +from .type import * + +from ..ops.linalg import ops_registry as linalg_ops_registry +from ..ops.tosa import ops_registry as tosa_ops_registry +from ..ops.math import ops_registry as math_ops_registry +from ..ops.func import ops_registry as func_ops_registry + +def json_to_graph(json_str): + """ + Converts a buddy graph JSON string to a Graph object. + + Args: + json_str (str): The JSON string representing the buddy graph. + + Returns: + Graph: The Graph object created from the JSON data. + """ + def json_to_tensormeta(json_data): + """ + Convert JSON data to a TensorMeta object. + + Args: + json_data (dict): JSON data representing a TensorMeta object. + + Returns: + TensorMeta: The TensorMeta object created from the JSON data. 
+ """ + if 'shape' in json_data: + shape = json_data['shape'] + dtype = next((member for member in TensorDType.__members__.values() if member.value.upper() == json_data['dtype'].upper()), None) + return TensorMeta(shape, dtype) + return {} + + json_data = json.loads(json_str) + _graph = json_data + graph_name = _graph['graph_name'] + inputs = [] + params = [] + for _input in _graph['inputs']: + inputs.append(json_to_tensormeta(_input)) + for _param in _graph['params']: + params.append(json_to_tensormeta(_param)) + ops_registry = {} + ops_registry.update(func_ops_registry) + ops_registry.update(linalg_ops_registry) + ops_registry.update(tosa_ops_registry) + ops_registry.update(math_ops_registry) + graph = Graph(inputs, params, ops_registry, graph_name) + graph.device = _graph['device'] + for _node in _graph['nodes']: + op_class = _node['class'] + op = globals()[op_class]() + + op._name = _node['name'] + op._children = _node['children'] + op._parents = _node['parents'] + op._arguments = _node['arguments'] + op._keyword_arguments = _node['keyword_arguments'] + op._type = next((member for member in OpType.__members__.values() if member.value == _node['type']), None) + + # TODO : node attr tensor_meta should be Class TensorMeta + if ('shape' not in _node['tensor_meta']): + op._tensor_meta = _node['tensor_meta'] + else: + op._tensor_meta = { + 'shape' : _node['tensor_meta']['shape'], + 'dtype' : next((member for member in TensorDType.__members__.values() if member.value.upper() == _node['tensor_meta']['dtype'].upper()), None) + } + graph.add_node(op) + + for subgraph_name, subgraph_body in _graph['subgraphs'].items(): + subgraph_ops = [] + for subgraph_node in subgraph_body: + op_name = subgraph_node['name'] + op = graph.node_table[op_name] + subgraph_ops.append(op) + graph.op_groups[subgraph_name] = subgraph_ops + + for subgraph_name, subgraph_device in _graph['subgraph_map_device'].items(): + if subgraph_device == 'CPU': + graph.group_map_device[subgraph_name] = DeviceType.CPU + elif subgraph_device == 'GPU': + graph.group_map_device[subgraph_name] = DeviceType.GPU + else: + graph.group_map_device[subgraph_name] = DeviceType.UNKNOW + + return graph diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py index 14bfbf2752..7632397bd5 100644 --- a/frontend/Python/graph/operation.py +++ b/frontend/Python/graph/operation.py @@ -124,6 +124,14 @@ def args(self): @property def kwargs(self): return self._keyword_arguments + + @property + def parents(self): + return self._parents + + @property + def children(self): + return self._children @property def name(self): diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py index a99dbe02c6..2522e17984 100644 --- a/frontend/Python/graph/transform/useless_op_eliminate.py +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -42,13 +42,24 @@ def maxpool2d_simplify(graph: Graph): and getitem_node.args[1] == 0 ): new_node = MaxPool2dOp() - new_node.name = getitem_node.name + new_node.name = node.name.replace("_with_indices", "") for arg in node.args: new_node.add_argument(arg) for parent in node._parents: new_node.add_parent(parent) + parent_node = graph.node_table[parent] + for cindex, child in enumerate(parent_node.children): + if child == node.name: + parent_node.children[cindex] = new_node.name for child in getitem_node._children: new_node.add_children(child) + child_node = graph.node_table[child] + for pindex, parent in 
enumerate(child_node.parents): + if parent == getitem_node.name: + child_node.parents[pindex] = new_node.name + for aindex, arg in enumerate(child_node.args): + if arg == getitem_node.name: + child_node.args[aindex] = new_node.name new_node.tensor_meta["shape"] = getitem_node.tensor_meta[ "shape" ] @@ -63,4 +74,4 @@ def maxpool2d_simplify(graph: Graph): for j, op in enumerate(graph.body): if op == getitem_node: graph.body[j] = new_node - break + break \ No newline at end of file From d09dd7370f7cd6e69723bfe1055ec3cd94c287f5 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Thu, 20 Jun 2024 16:05:38 +0800 Subject: [PATCH 05/29] add json_encoder and json_decoder --- env.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index fccbb02918..0000000000 --- a/env.sh +++ /dev/null @@ -1,7 +0,0 @@ -cd build/ -export BUDDY_MLIR_BUILD_DIR=$PWD -export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} - -export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ -cd ../ \ No newline at end of file From 4e779240ea549758e284d3ed7db8510894b4dd0c Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 31 Jul 2024 07:35:01 +0000 Subject: [PATCH 06/29] add gpu.container_module --- frontend/Python/graph/graph.py | 6 ++- frontend/Python/ops/gpu.py | 97 ++++++++++++++++++++++++++++++++++ frontend/Python/ops/utils.py | 2 + 3 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 frontend/Python/ops/gpu.py diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index eb78c0ff33..ea71a925c7 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -361,6 +361,7 @@ def __init__( ops_registry = {} self._symbol_table = {} self._body = body + self._device = DeviceType.GPU self._func_name = func_name self._params = params self._inputs = inputs @@ -440,7 +441,7 @@ def import_graph(self) -> ir.Module: shape_list = list(arg.shape) dtype = arg.dtype mlir_dtype = self._str_to_mlir_dtype(dtype) - tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) + tensor_arg = ir.MemrefType.get(shape_list, mlir_dtype) arguments.append(tensor_arg) extern_func = [] for node in self._body: @@ -473,6 +474,9 @@ def generated_func(*args): self._import_op(node) return self._symbol_table.get(("output", 0)) + + if self._device == DeviceType.GPU: + self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() return self._module diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py new file mode 100644 index 0000000000..e0d02ab492 --- /dev/null +++ b/frontend/Python/ops/gpu.py @@ -0,0 +1,97 @@ +# ===- func.py ----------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# ===--------------------------------------------------------------------------- +# +# The registry of mappings from Buddy node to MLIR GPU kernel. +# +# ===--------------------------------------------------------------------------- + + +from typing import Tuple +import mlir.ir as ir +from mlir.dialects import gpu, memref, arith, scf + +from ..graph import TensorDType +from ..graph import ( + ReluOp +) +from .utils import * + +def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): + """ + Import the buddy ReluOp. + From Buddy ReluOp to MLIR Relu GPU kernel. + """ + assert len(node.args) == 1 + input = symbol_table.get((str(node.args[0]), 0)) + if input is None: + return + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, 0) + memref_type = ir.MemrefType.get(output_shape, element.type) + unranked_memref_type = ir.UnrankedMemRefType.get(dtype, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input) + gpu.HostRegisterOp(input_cast) + + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + c512 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) + size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1024)) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, + blockSizeX=c512.result, blockSizeY=c1.result, blockSizeZ=c1.result, + ) + # Create a GPU kernel block and define grid and block dimensions for GPU execution + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), # %bx : index, Block index X + ir.IndexType.get(), # %by : index, Block index Y + ir.IndexType.get(), # %bz : index, Block index Z + ir.IndexType.get(), # %tx : index, Thread index X + ir.IndexType.get(), # %ty : index, Thread index Y + ir.IndexType.get(), # %tz : index, Thread index Z + ir.IndexType.get(), # %num_bx : index, Grid size X + ir.IndexType.get(), # %num_by : index, Grid size Y + ir.IndexType.get(), # %num_bz : index, Grid size Z + ir.IndexType.get(), # %num_tx : index, Block size X + ir.IndexType.get(), # %num_ty : index, Block size Y + ir.IndexType.get(), # %num_tz : index, Block size Z + ] + ) + + with ir.InsertionPoint(gpu_kernel_block): + tIdX = gpu_kernel_block.arguments[3] + cst_0 = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) + for1 = scf.ForOp( + lower_bound=tIdX, + upper_bound=size, + step=gpu_kernel.blockSizeX + ) + with ir.InsertionPoint(for1.body): + load = memref.LoadOp(arg0, [for1.induction_variable]) + result = arith.MaxNumFOp(load, cst_0) + memref.StoreOp(result, arg0, [for1.induction_variable]) + scf.YieldOp([]) + + gpu.TerminatorOp() + return op + +ops_registry = { + ReluOp: relu_op +} diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 337f5a6b49..1217c6af2c 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -54,3 +54,5 @@ def mlir_element_attr_get(type_name, value): case TensorDType.Bool: return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) + +def tensor_shape_size() \ No newline at end of file From 43b36243519e1fd5e98fa9e92a61b80ea1535685 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 12 Aug 2024 09:09:49 +0000 Subject: [PATCH 07/29] [frontend] Add GPU MLIR lowering path with ReLU operation support --- 
examples/BuddyTest/.gitignore | 2 + examples/BuddyTest/import-test.py | 55 ++++++++++++ examples/BuddyTest/model.py | 30 +++++++ frontend/Python/graph/graph.py | 5 +- frontend/Python/ops/gpu.py | 140 ++++++++++++++++++++++++------ frontend/Python/ops/utils.py | 43 ++++++++- 6 files changed, 244 insertions(+), 31 deletions(-) create mode 100644 examples/BuddyTest/.gitignore create mode 100644 examples/BuddyTest/import-test.py create mode 100644 examples/BuddyTest/model.py diff --git a/examples/BuddyTest/.gitignore b/examples/BuddyTest/.gitignore new file mode 100644 index 0000000000..6e9797bbe9 --- /dev/null +++ b/examples/BuddyTest/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +forward.mlir diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py new file mode 100644 index 0000000000..b47bba9b21 --- /dev/null +++ b/examples/BuddyTest/import-test.py @@ -0,0 +1,55 @@ +# ===- buddy-lenet-import.py --------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the Test model AOT importer. +# +# ===--------------------------------------------------------------------------- + +import os +from pathlib import Path + +import numpy as np +import torch +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.graph import GraphDriver +from buddy.compiler.graph.transform import simply_fuse +from buddy.compiler.ops.gpu import ops_registry as gpu_ops_registry +from model import TestModule + +model = TestModule() +model = model.eval() + +# Initialize Dynamo Compiler with specific configurations as an importer. +dynamo_compiler = DynamoCompiler( + primary_registry=gpu_ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +data = torch.randn([1, 1, 28, 28]) +# Import the model into MLIR module and parameters. +with torch.no_grad(): + graphs = dynamo_compiler.importer(model, data) + +assert len(graphs) == 1 +graph = graphs[0] +print(graph._body) +graph.lower_to_top_level_ir() +path_prefix = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: + print(graph._imported_module, file=module_file) + \ No newline at end of file diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py new file mode 100644 index 0000000000..67f3bfdafd --- /dev/null +++ b/examples/BuddyTest/model.py @@ -0,0 +1,30 @@ +# ===- model.py ---------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# Test model definition. +# +# ===--------------------------------------------------------------------------- + +import torch +import torch.nn as nn + +class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + + def forward(self, x): + x = torch.relu(x.view(2, 14, 28)) + return x.permute([1, 2, 0]) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index ea71a925c7..7c99b4391d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -244,7 +244,7 @@ def lower_to_top_level_ir(self): output_ranks = [] output_dtypes = [] for out_node in outputs: - out_type = ir.RankedTensorType(out_node.type) + out_type = ir.MemRefType(out_node.type) shape = list(out_type.shape) dtype = out_type.element_type match str(dtype): @@ -441,7 +441,7 @@ def import_graph(self) -> ir.Module: shape_list = list(arg.shape) dtype = arg.dtype mlir_dtype = self._str_to_mlir_dtype(dtype) - tensor_arg = ir.MemrefType.get(shape_list, mlir_dtype) + tensor_arg = ir.MemRefType.get(shape_list, mlir_dtype) arguments.append(tensor_arg) extern_func = [] for node in self._body: @@ -588,6 +588,7 @@ def _import_op(self, node: Op): node (Op): The buddy node representing the operation. """ + op_name = node.__class__.__name__ op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( self._ops_registry[op_name](node, self._symbol_table) diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index e0d02ab492..2eff78f97d 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -1,4 +1,4 @@ -# ===- func.py ----------------------------------------------------------------- +# ===- gpu.py ----------------------------------------------------------------- # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,7 +25,9 @@ from ..graph import TensorDType from ..graph import ( - ReluOp + ReluOp, + ReshapeOp, + PermuteOp ) from .utils import * @@ -40,58 +42,140 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): return output_shape = list(node.tensor_meta["shape"]) dtype = node.tensor_meta["dtype"] - element = mlir_element_attr_get(dtype, 0) - memref_type = ir.MemrefType.get(output_shape, element.type) - unranked_memref_type = ir.UnrankedMemRefType.get(dtype, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input) - gpu.HostRegisterOp(input_cast) + element_type = mlir_element_type_get(dtype) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - c512 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) - size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1024)) + kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) + # Flatten the input into a one-dimensional format + output_size = tensor_shape_size(output_shape) + size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) + shape = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + memref.StoreOp(size, shape, [c0]) + memref_reshape_type = ir.MemRefType.get([output_size], element_type) + input_reshape = memref.ReshapeOp(memref_reshape_type, input, shape) + + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input) + gpu.HostRegisterOp(input_cast) gpu_kernel = gpu.LaunchOp( asyncToken=None, asyncDependencies=[], gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, - blockSizeX=c512.result, blockSizeY=c1.result, blockSizeZ=c1.result, + blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, ) - # Create a GPU kernel block and define grid and block dimensions for GPU execution gpu_kernel_block = ir.Block.create_at_start( gpu_kernel.body, [ - ir.IndexType.get(), # %bx : index, Block index X - ir.IndexType.get(), # %by : index, Block index Y - ir.IndexType.get(), # %bz : index, Block index Z - ir.IndexType.get(), # %tx : index, Thread index X - ir.IndexType.get(), # %ty : index, Thread index Y - ir.IndexType.get(), # %tz : index, Thread index Z - ir.IndexType.get(), # %num_bx : index, Grid size X - ir.IndexType.get(), # %num_by : index, Grid size Y - ir.IndexType.get(), # %num_bz : index, Grid size Z - ir.IndexType.get(), # %num_tx : index, Block size X - ir.IndexType.get(), # %num_ty : index, Block size Y - ir.IndexType.get(), # %num_tz : index, Block size Z + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z ] ) with ir.InsertionPoint(gpu_kernel_block): tIdX = gpu_kernel_block.arguments[3] cst_0 = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) - for1 = scf.ForOp( + loop = scf.ForOp( lower_bound=tIdX, upper_bound=size, step=gpu_kernel.blockSizeX ) - with ir.InsertionPoint(for1.body): - load = 
memref.LoadOp(arg0, [for1.induction_variable]) + with ir.InsertionPoint(loop.body): + load = memref.LoadOp(input_reshape, [loop.induction_variable]) result = arith.MaxNumFOp(load, cst_0) - memref.StoreOp(result, arg0, [for1.induction_variable]) + memref.StoreOp(result, input_reshape, [loop.induction_variable]) scf.YieldOp([]) gpu.TerminatorOp() + output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + memref.CopyOp(input, output) + return output + +# TODO: Implement Reshape Operation on GPU in future revisions. + +def reshape_op(node: ReshapeOp, symbol_table): + """ + Import the reshape operation. + From buddy graph ir's `ReshapeOp` operator to MLIR Memref `reshape` + operation. + + Note: If the new shape contains one and only one `-1`, the size of the new + shape will be inferred automatically. + """ + input1 = symbol_table.get((str(node.args[0]), 0)) + new_shape = [] + for i in node.args[1]: + new_shape.append(i) + output_shape = list(node.tensor_meta["shape"]) + total_size = tensor_shape_size(output_shape) + + neg_one_cnt = 0 + rest_size = 1 + for dim_siz in new_shape: + if dim_siz == -1: + neg_one_cnt += 1 + continue + rest_size *= dim_siz + + if neg_one_cnt != 0: + if neg_one_cnt > 1 or total_size % rest_size != 0: + raise ValueError("Can not infer the new shape!") + infer_dim_size = total_size // rest_size + for i, _ in enumerate(new_shape): + if new_shape[i] == -1: + new_shape[i] = infer_dim_size + + shape = memref.AllocOp(ir.MemRefType.get([len(new_shape)], ir.IndexType.get()), [], []) + for i, _ in enumerate(new_shape): + c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i)) + size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), new_shape[i])) + memref.StoreOp(size, shape, [c]) + + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + output_type = ir.MemRefType.get(new_shape, element_type) + op = memref.ReshapeOp(output_type, input1, shape) + return op +# TODO: Implement Permute Operation on GPU in future revisions. + +def permute_op(node: PermuteOp, symbol_table): + """ + Import the permute operation. + From buddy graph ir's `PermuteOp` operator to MLIR Memref `transpose` + operation. 
+ """ + input1 = symbol_table.get((str(node.args[0]), 0)) + perm = node.args[1] + perm_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm)) + + output_shape = list(node.tensor_meta["shape"]) + element_type = mlir_element_type_get(node.tensor_meta["dtype"]) + input_shape = [0] * len(output_shape) + for i, p in enumerate(perm): + input_shape[p] = output_shape[i] + + offset = 0 + input_stride = generate_strides(input_shape) + output_stride = transpose_strides(input_stride, perm) + result_type = ir.MemRefType.get( + shape=output_shape, + element_type=element_type, + layout=ir.StridedLayoutAttr.get(offset, output_stride) + ) + permute_op = memref.TransposeOp( + result=result_type, + in_=input1, + permutation=perm_attr + ) + return permute_op + ops_registry = { - ReluOp: relu_op + "ReluOp": relu_op, + "ViewOp": reshape_op, + "PermuteOp": permute_op } diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 1217c6af2c..2b2dfe4ca2 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -55,4 +55,45 @@ def mlir_element_attr_get(type_name, value): return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) -def tensor_shape_size() \ No newline at end of file +def tensor_shape_size(shape): + """ + Calculate the product of all dimensions in the given shape list, + which represents the size of the tensor. + Args: + shape: A list containing the sizes of each dimension of the tensor. + """ + size = 1 + for dim in shape: + size *= dim + return size + +def generate_strides(shape): + """ + Generate strides based on the input matrix shape. + + Args: + shape (list[int]): The shape of the input matrix, e.g., [2, 3, 4]. + + Returns: + list[int]: The corresponding strides, e.g., [12, 4, 1]. + """ + strides = [] + stride = 1 + for dim in reversed(shape): + strides.insert(0, stride) + stride *= dim + return strides + +def transpose_strides(strides, permutation): + """ + Reorder strides based on the input permutation. + + Args: + strides (list[int]): The original strides list, e.g., [12, 4, 1]. + permutation (list[int]): The permutation order, e.g., [1, 2, 0]. + + Returns: + list[int]: The reordered strides list, e.g., [4, 1, 12]. 
+ """ + transposed_strides = [strides[i] for i in permutation] + return transposed_strides From 2d4eef1c9ea58e807437312a6d3e4771b2399cff Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 12 Aug 2024 09:17:20 +0000 Subject: [PATCH 08/29] delete env.sh --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 485cccfcf9..69426a81de 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # Clangd cache .cache + +# environment bash +env.sh \ No newline at end of file From 78f6bca5125ce773d79ce7c16a2d36fc416bd1af Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 12 Aug 2024 09:19:16 +0000 Subject: [PATCH 09/29] delete env.sh --- env.sh | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 env.sh diff --git a/env.sh b/env.sh deleted file mode 100644 index fccbb02918..0000000000 --- a/env.sh +++ /dev/null @@ -1,7 +0,0 @@ -cd build/ -export BUDDY_MLIR_BUILD_DIR=$PWD -export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} - -export LENET_EXAMPLE_PATH=${BUDDY_MLIR_BUILD_DIR}/../examples/BuddyLeNet/ -cd ../ \ No newline at end of file From abce38285b8176164e826d81810f10437a0f3d32 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Fri, 16 Aug 2024 01:50:39 +0000 Subject: [PATCH 10/29] [BuddyTest] Add Test Model E2E example. --- examples/BuddyTest/.gitignore | 3 +- examples/BuddyTest/CMakeLists.txt | 29 ++++++++++++ examples/BuddyTest/import-test.py | 3 +- examples/BuddyTest/makefile | 38 +++++++++++++++ examples/BuddyTest/model.py | 4 +- examples/BuddyTest/test-main.cpp | 79 +++++++++++++++++++++++++++++++ examples/CMakeLists.txt | 4 ++ 7 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 examples/BuddyTest/CMakeLists.txt create mode 100644 examples/BuddyTest/makefile create mode 100644 examples/BuddyTest/test-main.cpp diff --git a/examples/BuddyTest/.gitignore b/examples/BuddyTest/.gitignore index 6e9797bbe9..081f173509 100644 --- a/examples/BuddyTest/.gitignore +++ b/examples/BuddyTest/.gitignore @@ -1,2 +1,3 @@ __pycache__ -forward.mlir +*.mlir +log.ll diff --git a/examples/BuddyTest/CMakeLists.txt b/examples/BuddyTest/CMakeLists.txt new file mode 100644 index 0000000000..2e3654b347 --- /dev/null +++ b/examples/BuddyTest/CMakeLists.txt @@ -0,0 +1,29 @@ +add_custom_command( + OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir + COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyTest/import-test.py + COMMENT "Generating forward.mlir" +) + + +add_custom_command( + OUTPUT forward.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llvm-as | + ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyTest/forward.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir + COMMENT "Building forward.o" + VERBATIM) + + +add_library(TEST STATIC forward.o) + +SET_TARGET_PROPERTIES(TEST PROPERTIES LINKER_LANGUAGE C) + +add_executable(buddy-test-run test-main.cpp) +target_link_directories(buddy-test-run PRIVATE 
${LLVM_MLIR_LIBRARY_DIR}) + +set(BUDDY_TEST_LIBS TEST mlir_runner_utils mlir_cuda_runtime) +target_link_libraries(buddy-test-run ${BUDDY_TEST_LIBS}) diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py index b47bba9b21..3cd2573830 100644 --- a/examples/BuddyTest/import-test.py +++ b/examples/BuddyTest/import-test.py @@ -47,9 +47,8 @@ assert len(graphs) == 1 graph = graphs[0] -print(graph._body) graph.lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(graph._imported_module, file=module_file) - \ No newline at end of file + \ No newline at end of file diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile new file mode 100644 index 0000000000..7b9dd646f9 --- /dev/null +++ b/examples/BuddyTest/makefile @@ -0,0 +1,38 @@ +#!/bin/bash +BUDDY_OPT := ../../build/bin/buddy-opt +MLIR_OPT := ../../llvm/build/bin/mlir-opt +MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate +MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner +LLC := ../../llvm/build/bin/llc +OPT_FLAG := -O0 + +ifeq ($(shell uname),Linux) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so +MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so +MLIR_CUDA_RUNTIME := ../../llvm/build/lib/libmlir_cuda_runtime.so +MTRIPLE := x86_64-unknown-linux-gnu +else ifeq ($(shell uname),Darwin) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib +MLIR_ASYNC_RUNTIME := ./../llvm/build/lib/libmlir_async_runtime.dylib +MTRIPLE := x86_64-apple-darwin +endif + +gpu-test-lower: + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_OPT} -o log.mlir + +gpu-test-translate: + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll + +gpu-test-run: + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index 67f3bfdafd..fed677d6be 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -26,5 +26,5 @@ def __init__(self): super(TestModule, self).__init__() def forward(self, x): - x = torch.relu(x.view(2, 14, 28)) - return x.permute([1, 2, 0]) + x = torch.relu(x.view(28, 28)) + return x.permute([1,0]) diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp new file mode 100644 index 0000000000..e53c25192c --- /dev/null +++ b/examples/BuddyTest/test-main.cpp @@ -0,0 +1,79 @@ +//===- test-main.cpp 
------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace buddy;
+
+extern "C" void
+_mlir_ciface_forward(MemRef<float, 2> *result, MemRef<float, 4> *input);
+
+int main() {
+  /// Initialize data containers.
+  MemRef<float, 4> input({1, 1, 28, 28});
+  MemRef<float, 2> result({28, 28});
+
+  for (int i = 0; i < 28; i++) {
+    for (int j = 0; j < 28; j++) {
+      int index = i * 28 + j;
+      input[index] = static_cast<float>(index);
+    }
+  }
+  // Print the generated data to verify
+  for (int i = 0; i < 28; i++) {
+    for (int j = 0; j < 28; j++) {
+      std::cout << input[i * 28 + j] << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  const auto inferenceStart = std::chrono::high_resolution_clock::now();
+
+  /// Execute forward inference of the model.
+  _mlir_ciface_forward(&result, &input);
+
+  const auto inferenceEnd = std::chrono::high_resolution_clock::now();
+  const std::chrono::duration<double, std::milli> inferenceTime =
+      inferenceEnd - inferenceStart;
+
+  /// Print the output data for verification.
+  std::cout << "\033[33;1m[Output] \033[0m";
+  std::cout << "[";
+  for (int i = 0; i < 28; i++) {
+    if (i > 0) std::cout << " ";
+    std::cout << "[";
+    for (int j = 0; j < 28; j++) {
+      if (j > 0) std::cout << " ";
+      std::cout << result[i * 28 + j];
+    }
+    std::cout << "]";
+    if (i < 27) std::cout << "\n ";
+  }
+  std::cout << "]" << std::endl;
+
+  /// Print the performance.
+  std::cout << "\033[33;1m[Time] \033[0m";
+  std::cout << inferenceTime.count() << " ms"
+            << std::endl;
+
+  return 0;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0b575f3f4a..a9c0a54e30 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -16,6 +16,10 @@ if (BUDDY_LENET_EXAMPLES)
   add_subdirectory(BuddyLeNet)
 endif()
 
+if (BUDDY_TEST_EXAMPLES)
+  add_subdirectory(BuddyTest)
+endif()
+
 if (BUDDY_MOBILENETV3_EXAMPLES)
   add_subdirectory(BuddyMobileNetV3)
 endif()
From 3d00fe6b58f51e14bde743dd8a0d4cac1372960b Mon Sep 17 00:00:00 2001
From: wdjyd <1014108056@qq.com>
Date: Fri, 16 Aug 2024 03:44:09 +0000
Subject: [PATCH 11/29] [BuddyTest] Add README.

---
 examples/BuddyTest/README | 65 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 examples/BuddyTest/README

diff --git a/examples/BuddyTest/README b/examples/BuddyTest/README
new file mode 100644
index 0000000000..49cb8fa64f
--- /dev/null
+++ b/examples/BuddyTest/README
@@ -0,0 +1,65 @@
+# Buddy Compiler Test Example
+
+0. Activate your python environment.
+
+1. Build LLVM/MLIR
+
+```bash
+$ cd buddy-mlir
+$ mkdir llvm/build
+$ cd llvm/build
+$ cmake -G Ninja ../llvm \
+    -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \
+    -DLLVM_TARGETS_TO_BUILD="host;RISCV;NVPTX" \
+    -DMLIR_ENABLE_CUDA_RUNNER=ON \
+    -DLLVM_ENABLE_ASSERTIONS=ON \
+    -DOPENMP_ENABLE_LIBOMPTARGET=OFF \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+    -DPython3_EXECUTABLE=$(which python3)
+$ ninja check-clang check-mlir omp
+```
+
+2. Build buddy-mlir
+
+```bash
+$ mkdir build && cd build
+$ cmake -G Ninja .. \
+    -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \
+    -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \
+    -DLLVM_ENABLE_ASSERTIONS=ON \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
+    -DPython3_EXECUTABLE=$(which python3)
+$ ninja
+$ ninja check-buddy
+```
+
+3. Set the `PYTHONPATH` environment variable.
+
+Make sure you are in the build directory.
+
+```bash
+$ export BUDDY_MLIR_BUILD_DIR=$PWD
+$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
+$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
+```
+
+4. Build and run the Test example
+
+```bash
+$ cmake -G Ninja .. -DBUDDY_TEST_EXAMPLES=ON
+$ ninja buddy-test-run
+$ cd bin
+$ ./buddy-test-run
+```
+
+## Debug the Lowering Pass Pipeline with Fake Parameters.
+
+```bash
+$ cd buddy-mlir
+$ cd examples/BuddyTest
+$ make gpu-test-lower
+$ make gpu-test-translate
+$ make gpu-test-run
+```
From ae794aaa63d187aee49ec4c146728ce10af35eb3 Mon Sep 17 00:00:00 2001
From: wdjyd <1014108056@qq.com>
Date: Fri, 16 Aug 2024 03:45:13 +0000
Subject: [PATCH 12/29] [BuddyTest] Add README.

---
 examples/BuddyTest/{README => README.md} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/BuddyTest/{README => README.md} (100%)

diff --git a/examples/BuddyTest/README b/examples/BuddyTest/README.md
similarity index 100%
rename from examples/BuddyTest/README
rename to examples/BuddyTest/README.md
From b57103c4cd85b873d7459288b785379058160c4b Mon Sep 17 00:00:00 2001
From: wdjyd <1014108056@qq.com>
Date: Fri, 30 Aug 2024 08:30:23 +0000
Subject: [PATCH 13/29] [frontend] Add GPU MLIR lowering path with Conv2d operation support

---
 examples/BuddyTest/README.md | 2 +-
 examples/BuddyTest/import-test.py | 2 +-
 examples/BuddyTest/makefile | 18 +++
 examples/BuddyTest/model.py | 5 +-
 examples/BuddyTest/test-main.cpp | 66 +++++---
 frontend/Python/ops/gpu.py | 240 ++++++++++++++++++++++++++++--
 6 files changed, 299 insertions(+), 34 deletions(-)

diff --git a/examples/BuddyTest/README.md b/examples/BuddyTest/README.md
index 49cb8fa64f..f057723bb3 100644
--- a/examples/BuddyTest/README.md
+++ b/examples/BuddyTest/README.md
@@ -10,7 +10,7 @@ $ mkdir llvm/build
 $ cd llvm/build
 $ cmake -G Ninja ../llvm \
     -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \
-    -DLLVM_TARGETS_TO_BUILD="host;RISCV;NVPTX" \
+    -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \
     -DMLIR_ENABLE_CUDA_RUNNER=ON \
     -DLLVM_ENABLE_ASSERTIONS=ON \
     -DOPENMP_ENABLE_LIBOMPTARGET=OFF \
diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py
index 3cd2573830..e2e863a66c 100644
--- a/examples/BuddyTest/import-test.py
+++ b/examples/BuddyTest/import-test.py
@@ -40,7 +40,7 @@
     aot_autograd_decomposition=inductor_decomp,
 )
 
-data = torch.randn([1, 1, 28, 28])
+data = torch.randn([1, 6, 32, 32])
 # Import the model into MLIR module and parameters.
with torch.no_grad(): graphs = dynamo_compiler.importer(model, data) diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile index 7b9dd646f9..02aba04064 100644 --- a/examples/BuddyTest/makefile +++ b/examples/BuddyTest/makefile @@ -36,3 +36,21 @@ gpu-test-run: ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} + +gpu-conv2d-lower: + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_OPT} -o log.mlir + +gpu-conv2d-translate: + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll + +gpu-conv2d-run: + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index fed677d6be..0439ed70f4 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -24,7 +24,8 @@ class TestModule(nn.Module): def __init__(self): super(TestModule, self).__init__() + self.conv1 = nn.Conv2d(6, 1, 5) def forward(self, x): - x = torch.relu(x.view(28, 28)) - return x.permute([1,0]) + x = self.conv1(x) + return x diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp index e53c25192c..c9d0c60801 100644 --- a/examples/BuddyTest/test-main.cpp +++ b/examples/BuddyTest/test-main.cpp @@ -25,31 +25,63 @@ using namespace buddy; extern "C" void -_mlir_ciface_forward(MemRef *result, MemRef *input); +_mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); int main() { /// Initialize data containers. 
- MemRef input({1, 1, 28, 28}); - MemRef result({28, 28}); + const int N = 1; + const int C = 6; + const int K = 1; + const int kernel_size = 5; + const int H = 32; + const int W = 32; + const int H_out = H - kernel_size + 1; + const int W_out = W - kernel_size + 1; - for (int i = 0; i < 28; i++) { - for (int j = 0; j < 28; j++) { - int index = i * 28 + j; - input[index] = static_cast(index); + MemRef input({N, C, H, W}); + MemRef filter({K, C, kernel_size, kernel_size}); + MemRef bias({K}); + MemRef result({N, K, H_out, W_out}); + + // Initial the input data + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int i = 0; i < H; i++) { + for (int j = 0; j < W; j++) { + int index = n * C * H * W + c * H * W + i * W + j; + input[index] = static_cast(1); + } + } } } - // Print the generated data to verify - for (int i = 0; i < 28; i++) { - for (int j = 0; j < 28; j++) { - std::cout << input[i * 28 + j] << " "; + for (int k = 0; k < K; k++) { + for (int c = 0; c < C; c++) { + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < kernel_size; j++) { + int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; + filter[index] = static_cast(1); + } + } } - std::cout << std::endl; } + + for (int k = 0; k < K; k++) { + bias[k] = 1; + } + + // Print the generated data to verify + + // for (int i = 0; i < H; i++) { + // for (int j = 0; j < W; j++) { + // std::cout << input[i * W + j] << " "; + // } + // std::cout << std::endl; + // } const auto inferenceStart = std::chrono::high_resolution_clock::now(); /// Execute forward inference of the model. - _mlir_ciface_forward(&result, &input); + _mlir_ciface_forward(&result, &filter, &bias, &input); const auto inferenceEnd = std::chrono::high_resolution_clock::now(); const std::chrono::duration inferenceTime = @@ -58,15 +90,15 @@ int main() { /// Print the output data for verification. std::cout << "\033[33;1m[Output] \033[0m"; std::cout << "["; - for (int i = 0; i < 28; i++) { + for (int i = 0; i < H_out; i++) { if (i > 0) std::cout << " "; std::cout << "["; - for (int j = 0; j < 28; j++) { + for (int j = 0; j < W_out; j++) { if (j > 0) std::cout << " "; - std::cout << result[i * 28 + j]; + std::cout << result[i * W_out + j]; } std::cout << "]"; - if (i < 27) std::cout << "\n "; + if (i < H_out - 1) std::cout << "\n "; } std::cout << "]" << std::endl; diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index 2eff78f97d..31654e3274 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -27,10 +27,14 @@ from ..graph import ( ReluOp, ReshapeOp, - PermuteOp + PermuteOp, + Conv2dOp, + MaxPool2dOp ) from .utils import * +TILE_WIDTH = 16 + def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): """ Import the buddy ReluOp. 
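# A quick reference for the launch pattern used in relu_op below (an
# illustrative sketch; the helper name is made up): the kernel is launched as
# a single block, and each thread walks the flattened buffer with a
# block-stride loop, so buffers of any size are covered regardless of the
# block size.
def _block_stride_indices(thread_idx, block_size, size):
    # Indices handled by one thread: thread_idx, thread_idx + block_size, ...
    return list(range(thread_idx, size, block_size))

assert _block_stride_indices(3, 4, 10) == [3, 7]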
@@ -62,24 +66,37 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): gpu_kernel = gpu.LaunchOp( asyncToken=None, asyncDependencies=[], - gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, - blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, + gridSizeX=c1.result, + gridSizeY=c1.result, + gridSizeZ=c1.result, + blockSizeX=kernels.result, + blockSizeY=c1.result, + blockSizeZ=c1.result, ) gpu_kernel_block = ir.Block.create_at_start( gpu_kernel.body, [ - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z + ir.IndexType.get(), # block_id x + ir.IndexType.get(), # block_id y + ir.IndexType.get(), # block_id z + ir.IndexType.get(), # thread_id x + ir.IndexType.get(), # thread_id y + ir.IndexType.get(), # thread_id z + ir.IndexType.get(), # grid_size x + ir.IndexType.get(), # grid_size y + ir.IndexType.get(), # grid_size z + ir.IndexType.get(), # block_size x + ir.IndexType.get(), # block_size y + ir.IndexType.get(), # block_size z ] ) with ir.InsertionPoint(gpu_kernel_block): - tIdX = gpu_kernel_block.arguments[3] - cst_0 = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) + thread_local_idx = gpu_kernel_block.arguments[3] + element_attr = mlir_element_attr_get(dtype, 0.0) + cst_0 = arith.ConstantOp(element_type, element_attr) loop = scf.ForOp( - lower_bound=tIdX, + lower_bound=thread_local_idx, upper_bound=size, step=gpu_kernel.blockSizeX ) @@ -94,8 +111,8 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): memref.CopyOp(input, output) return output -# TODO: Implement Reshape Operation on GPU in future revisions. +# TODO: Implement Reshape Operation on GPU in future revisions. def reshape_op(node: ReshapeOp, symbol_table): """ Import the reshape operation. @@ -141,8 +158,8 @@ def reshape_op(node: ReshapeOp, symbol_table): return op -# TODO: Implement Permute Operation on GPU in future revisions. +# TODO: Implement Permute Operation on GPU in future revisions. def permute_op(node: PermuteOp, symbol_table): """ Import the permute operation. @@ -174,8 +191,205 @@ def permute_op(node: PermuteOp, symbol_table): ) return permute_op + +# TODO: Consider the cases where the arguments take different values. +def convolution2d_op(node: Conv2dOp, symbol_table): + """ + Import the convolution operation. + From Buddy Conv2dOp to MLIR GPU `conv2d` kernel. + arg[0]: Tensor input + arg[1]: Tensor weight + arg[2]: Tensor? bias + arg[3]: SymInt[] stride + arg[4]: SymInt[] padding + arg[5]: SymInt[] dilation + arg[6]: bool transposed + arg[7]: SymInt[] output_padding + arg[8]: SymInt groups + """ + # Get arguments from convolution node. + assert len(node.args) == 9 + input = node.args[0] + filter = node.args[1] + bias = node.args[2] + stride = node.args[3] + input_padding = node.args[4] + dilation = node.args[5] + is_kernel_transposed = node.args[6] + out_padding = node.args[7] + groups = node.args[8] + + # TODO: Consider the cases where the variables take different values. 
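+    # The asserts below pin this kernel to the simplest convolution case:
+    # zero padding, unit dilation, no transposition, and a single group.
+    # Under those assumptions the output size follows the usual formula
+    # out_size = (in_size - filter_size) // stride + 1, e.g. the 32x32
+    # input and 5x5 filter used in this example give (32 - 5) // 1 + 1 = 28.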
+ assert input_padding[0] == input_padding[1] == 0 + assert dilation[0] == dilation[1] == 1 + assert is_kernel_transposed == False + assert out_padding[0] == out_padding[1] == 0 + assert groups == 1 + + # Prepare input, filter, and output information. + input_val = symbol_table.get((str(input), 0)) + input_shape = list(ir.MemRefType(input_val.type).shape) + filter_val = symbol_table.get((str(filter), 0)) + filter_shape = ir.MemRefType(filter_val.type).shape + bias_val = symbol_table.get((str(bias), 0)) + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + output_shape = list(node.tensor_meta["shape"]) + + batch_size = input_shape[0] + in_channels = input_shape[1] + out_channels = output_shape[0] + H_in = input_shape[2] + W_in = input_shape[3] + H_out = output_shape[2] + W_out = output_shape[3] + H_filter = filter_shape[2] + W_filter = filter_shape[3] + + output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input_val) + filter_cast = memref.CastOp(unranked_memref_type, filter_val) + output_cast = memref.CastOp(unranked_memref_type, output_val) + + gpu.HostRegisterOp(input_cast) + gpu.HostRegisterOp(filter_cast) + gpu.HostRegisterOp(output_cast) + + # Tile the input_val into Grids + block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) + in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) + out_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_channels)) + block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) + tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) + H_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_filter)) + W_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_filter)) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + + # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=batch_size_val.result, + gridSizeY=out_channels_val.result, + gridSizeZ=block_z_val.result, + blockSizeX=tile_width_val.result, + blockSizeY=tile_width_val.result, + blockSizeZ=c1.result, + ) + + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), # block_id x + ir.IndexType.get(), # block_id y + ir.IndexType.get(), # block_id z + ir.IndexType.get(), # thread_id x + ir.IndexType.get(), # thread_id y + ir.IndexType.get(), # thread_id z + ir.IndexType.get(), # grid_size x + ir.IndexType.get(), # grid_size y + ir.IndexType.get(), # grid_size z + ir.IndexType.get(), # block_size x + ir.IndexType.get(), # block_size y + ir.IndexType.get(), # block_size z + ] + ) + + with ir.InsertionPoint(gpu_kernel_block): + batch_id = gpu_kernel_block.arguments[0] + out_channel_id = gpu_kernel_block.arguments[1] + tile_id = gpu_kernel_block.arguments[2] + thread_local_idx = gpu_kernel_block.arguments[3] + thread_local_idy = 
gpu_kernel_block.arguments[4] + + # Calculate the convolution element at (h, w) for this thread + tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) + + t0 = arith.divui(tile_id, tile_num_val) + t1 = arith.muli(t0, tile_width_val) + thread_global_idx = arith.addi(t1, thread_local_idx) + + t2 = arith.remui(tile_id, tile_num_val) + t3 = arith.muli(t2, tile_width_val) + thread_global_idy = arith.addi(t3, thread_local_idy) + + stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) + stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) + t4 = arith.muli(thread_global_idx, stride_h) + t5 = arith.muli(thread_global_idy, stride_w) + + # Check if the (h, w) is out of the output bounds + ult = 6 + H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) + W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) + isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + isInBounds = arith.andi(isHInBounds, isWInBounds) + + cst_0 = arith.ConstantOp(element_type, mlir_element_attr_get(dtype, 0.0)) + branch0 = scf.IfOp(isInBounds) + with ir.InsertionPoint(branch0.then_block): + loop0 = scf.ForOp( + lower_bound=c0.result, + upper_bound=in_channels_val.result, + step=c1.result, + iter_args=[cst_0.result] + ) + with ir.InsertionPoint(loop0.body): + loop1 = scf.ForOp( + lower_bound=c0.result, + upper_bound=H_filter_val.result, + step=c1.result, + iter_args=[cst_0.result] + ) + with ir.InsertionPoint(loop1.body): + loop2 = scf.ForOp( + lower_bound=c0.result, + upper_bound=W_filter_val.result, + step=c1.result, + iter_args=[cst_0.result] + ) + with ir.InsertionPoint(loop2.body): + # TODO : loop body + in_channel_id = loop0.body.arguments[0] + filter_ele_idx = loop1.body.arguments[0] + filter_ele_idy = loop2.body.arguments[0] + input_ele_idx = arith.addi(t4, filter_ele_idx) + input_ele_idy = arith.addi(t5, filter_ele_idy) + input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) + filter_ele = memref.LoadOp(filter_val, [out_channel_id, in_channel_id, filter_ele_idx, filter_ele_idy]) + t6 = arith.mulf(input_ele, filter_ele) + iter_arg2 = loop2.body.arguments[1] + iter_res2 = arith.addf(iter_arg2, t6) + scf.YieldOp([iter_res2]) + + iter_arg1 = loop1.body.arguments[1] + iter_res1 = arith.addf(loop2, iter_arg1) + scf.YieldOp([iter_res1]) + + iter_arg0 = loop0.body.arguments[1] + iter_res0 = arith.addf(loop1, iter_arg0) + scf.YieldOp([iter_res0]) + + # Add bias data for any out_channel. 
+ bias_ele = memref.LoadOp(bias_val, [out_channel_id]) + result = arith.addf(loop0, bias_ele) + memref.StoreOp(result, output_val, [batch_id, out_channel_id, thread_global_idx, thread_global_idy]) + scf.YieldOp([]) + + gpu.TerminatorOp() + + return output_val + + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, - "PermuteOp": permute_op + "PermuteOp": permute_op, + "Conv2dOp": convolution2d_op, } From 0adf1df8769e245538e5fbc713f8ea68c96c8d56 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Mon, 2 Sep 2024 13:48:10 +0000 Subject: [PATCH 14/29] [frontend] Add GPU MLIR lowering path with MaxPool2d operation support --- examples/BuddyTest/import-test.py | 2 +- examples/BuddyTest/model.py | 4 +- examples/BuddyTest/test-main.cpp | 50 +++++----- frontend/Python/ops/gpu.py | 150 ++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 25 deletions(-) diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py index e2e863a66c..5636ba5e07 100644 --- a/examples/BuddyTest/import-test.py +++ b/examples/BuddyTest/import-test.py @@ -40,7 +40,7 @@ aot_autograd_decomposition=inductor_decomp, ) -data = torch.randn([1, 6, 32, 32]) +data = torch.randn([1, 1, 32, 32]) # Import the model into MLIR module and parameters. with torch.no_grad(): graphs = dynamo_compiler.importer(model, data) diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index 0439ed70f4..a6b6e7d71b 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -24,8 +24,10 @@ class TestModule(nn.Module): def __init__(self): super(TestModule, self).__init__() - self.conv1 = nn.Conv2d(6, 1, 5) + self.conv1 = nn.Conv2d(1, 6, 5) + self.pool = nn.MaxPool2d(2, 2) def forward(self, x): x = self.conv1(x) + x = self.pool(x) return x diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp index c9d0c60801..d1764bccd2 100644 --- a/examples/BuddyTest/test-main.cpp +++ b/examples/BuddyTest/test-main.cpp @@ -24,24 +24,28 @@ using namespace buddy; +// extern "C" void +// _mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); + extern "C" void -_mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); +_mlir_ciface_forward(MemRef *result, MemRef *input); int main() { /// Initialize data containers. 
const int N = 1; - const int C = 6; + const int C = 1; const int K = 1; - const int kernel_size = 5; + const int kernel_size = 2; + const int stride = 2; const int H = 32; const int W = 32; - const int H_out = H - kernel_size + 1; - const int W_out = W - kernel_size + 1; + const int H_out = H / kernel_size; + const int W_out = W / kernel_size; MemRef input({N, C, H, W}); - MemRef filter({K, C, kernel_size, kernel_size}); - MemRef bias({K}); - MemRef result({N, K, H_out, W_out}); + // MemRef filter({K, C, kernel_size, kernel_size}); + // MemRef bias({K}); + MemRef result({N, C, H_out, W_out}); // Initial the input data for (int n = 0; n < N; n++) { @@ -49,25 +53,25 @@ int main() { for (int i = 0; i < H; i++) { for (int j = 0; j < W; j++) { int index = n * C * H * W + c * H * W + i * W + j; - input[index] = static_cast(1); - } - } - } - } - for (int k = 0; k < K; k++) { - for (int c = 0; c < C; c++) { - for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < kernel_size; j++) { - int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; - filter[index] = static_cast(1); + input[index] = static_cast((float)index/(H*W)); } } } } + // for (int k = 0; k < K; k++) { + // for (int c = 0; c < C; c++) { + // for (int i = 0; i < kernel_size; i++) { + // for (int j = 0; j < kernel_size; j++) { + // int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; + // filter[index] = static_cast(1); + // } + // } + // } + // } - for (int k = 0; k < K; k++) { - bias[k] = 1; - } + // for (int k = 0; k < K; k++) { + // bias[k] = 1; + // } // Print the generated data to verify @@ -81,7 +85,7 @@ int main() { const auto inferenceStart = std::chrono::high_resolution_clock::now(); /// Execute forward inference of the model. - _mlir_ciface_forward(&result, &filter, &bias, &input); + _mlir_ciface_forward(&result, &input); const auto inferenceEnd = std::chrono::high_resolution_clock::now(); const std::chrono::duration inferenceTime = diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index 31654e3274..c99f074ec5 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -387,9 +387,159 @@ def convolution2d_op(node: Conv2dOp, symbol_table): return output_val +# TODO: Consider the cases where the maxpool2d operation needs padding. +def maxpool2d_op(node: MaxPool2dOp, symbol_table): + """ + Import the maxpool2d operation. + From Buddy MaxPool2dOp to MLIR GPU `max_pool2d` kernel. 
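+
+    Mapping sketch (as implemented below): each GPU thread computes one
+    output element. The output plane is tiled into TILE_WIDTH x TILE_WIDTH
+    tiles; the launch grid is (batch_size, in_channels, number_of_tiles)
+    and every block holds TILE_WIDTH x TILE_WIDTH threads. Threads whose
+    (h, w) falls outside the output bounds are masked out with an scf.if.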
+ """ + if len(node.args) == 5: + raise NotImplementedError + input1 = node.args[0] + kernel = node.args[1] + stride = node.args[2] + + # Prepare padding data + if len(node.args) > 3: + pad = node.args[3] + else: + pad = [0 for _ in kernel] + + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + output_shape = node.tensor_meta["shape"] + + batch_size = output_shape[0] + in_channels = output_shape[1] + H_out = output_shape[2] + W_out = output_shape[3] + + input_val = symbol_table.get((str(input1), 0)) + output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + input_cast = memref.CastOp(unranked_memref_type, input_val) + output_cast = memref.CastOp(unranked_memref_type, output_val) + + gpu.HostRegisterOp(input_cast) + gpu.HostRegisterOp(output_cast) + + # Tile the input_val into Grids + block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) + in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) + block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) + tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + + # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=batch_size_val.result, + gridSizeY=in_channels_val.result, + gridSizeZ=block_z_val.result, + blockSizeX=tile_width_val.result, + blockSizeY=tile_width_val.result, + blockSizeZ=c1.result, + ) + + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), # block_id x + ir.IndexType.get(), # block_id y + ir.IndexType.get(), # block_id z + ir.IndexType.get(), # thread_id x + ir.IndexType.get(), # thread_id y + ir.IndexType.get(), # thread_id z + ir.IndexType.get(), # grid_size x + ir.IndexType.get(), # grid_size y + ir.IndexType.get(), # grid_size z + ir.IndexType.get(), # block_size x + ir.IndexType.get(), # block_size y + ir.IndexType.get(), # block_size z + ] + ) + + with ir.InsertionPoint(gpu_kernel_block): + batch_id = gpu_kernel_block.arguments[0] + in_channel_id = gpu_kernel_block.arguments[1] + tile_id = gpu_kernel_block.arguments[2] + thread_local_idx = gpu_kernel_block.arguments[3] + thread_local_idy = gpu_kernel_block.arguments[4] + + # Calculate the convolution element at (h, w) for this thread + tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) + + t0 = arith.divui(tile_id, tile_num_val) + t1 = arith.muli(t0, tile_width_val) + thread_global_idx = arith.addi(t1, thread_local_idx) + + t2 = arith.remui(tile_id, tile_num_val) + t3 = arith.muli(t2, tile_width_val) + thread_global_idy = arith.addi(t3, thread_local_idy) + + kernel_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) + kernel_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) + stride_h = arith.ConstantOp(ir.IndexType.get(), 
ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) + stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) + t4 = arith.muli(thread_global_idx, stride_h) + t5 = arith.muli(thread_global_idy, stride_w) + + first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, t4, t5]) + + # Check if the (h, w) is out of the output bounds + ult = 6 + H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) + W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) + isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + isInBounds = arith.andi(isHInBounds, isWInBounds) + + branch0 = scf.IfOp(isInBounds) + with ir.InsertionPoint(branch0.then_block): + loop0 = scf.ForOp( + lower_bound=c0.result, + upper_bound=kernel_h.result, + step=c1.result, + iter_args=[first_ele.result] + ) + with ir.InsertionPoint(loop0.body): + loop1 = scf.ForOp( + lower_bound=c0.result, + upper_bound=kernel_w.result, + step=c1.result, + iter_args=[first_ele.result] + ) + with ir.InsertionPoint(loop1.body): + # TODO : loop body + kernel_ele_idx = loop0.body.arguments[0] + kernel_ele_idy = loop1.body.arguments[0] + input_ele_idx = arith.addi(t4, kernel_ele_idx) + input_ele_idy = arith.addi(t5, kernel_ele_idy) + input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) + iter_arg1 = loop1.body.arguments[1] + iter_res1 = arith.maxnumf(iter_arg1, input_ele) + scf.YieldOp([iter_res1]) + + iter_arg0 = loop0.body.arguments[1] + iter_res0 = arith.maxnumf(loop1, iter_arg0) + scf.YieldOp([iter_res0]) + + memref.StoreOp(loop0, output_val, [batch_id, in_channel_id, thread_global_idx, thread_global_idy]) + scf.YieldOp([]) + + gpu.TerminatorOp() + + return output_val + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, "PermuteOp": permute_op, "Conv2dOp": convolution2d_op, + "MaxPool2dOp": maxpool2d_op } From f63634141a5fd8b9ebf0ed4a44fcc9dbf433a7d2 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Tue, 3 Sep 2024 02:12:11 +0000 Subject: [PATCH 15/29] [frontend] Fix Permute Op --- examples/BuddyTest/import-test.py | 3 +- examples/BuddyTest/model.py | 8 +++- frontend/Python/ops/gpu.py | 72 +++++++++++++++++-------------- frontend/Python/ops/utils.py | 31 ------------- 4 files changed, 48 insertions(+), 66 deletions(-) diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py index 5636ba5e07..79620d9d44 100644 --- a/examples/BuddyTest/import-test.py +++ b/examples/BuddyTest/import-test.py @@ -40,13 +40,14 @@ aot_autograd_decomposition=inductor_decomp, ) -data = torch.randn([1, 1, 32, 32]) +data = torch.randn([1, 1, 12, 10]) # Import the model into MLIR module and parameters. 
with torch.no_grad(): graphs = dynamo_compiler.importer(model, data) assert len(graphs) == 1 graph = graphs[0] +print(graph.body) graph.lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py index a6b6e7d71b..d72af61c95 100644 --- a/examples/BuddyTest/model.py +++ b/examples/BuddyTest/model.py @@ -26,8 +26,12 @@ def __init__(self): super(TestModule, self).__init__() self.conv1 = nn.Conv2d(1, 6, 5) self.pool = nn.MaxPool2d(2, 2) + self.fc1 = nn.Linear(120,84) def forward(self, x): - x = self.conv1(x) - x = self.pool(x) + # x = self.conv1(x) + # x = self.pool(x) + x = x.view(-1, 120) + x = self.fc1(x) return x + diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index c99f074ec5..042770a8b5 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -176,9 +176,15 @@ def permute_op(node: PermuteOp, symbol_table): for i, p in enumerate(perm): input_shape[p] = output_shape[i] + # Prepare input_stride and output_stride data + input_stride = [] + stride = 1 + for dim in reversed(input_shape): + input_stride.insert(0, stride) + stride *= dim + output_stride = [input_stride[i] for i in perm] + offset = 0 - input_stride = generate_strides(input_shape) - output_stride = transpose_strides(input_stride, perm) result_type = ir.MemRefType.get( shape=output_shape, element_type=element_type, @@ -188,8 +194,10 @@ def permute_op(node: PermuteOp, symbol_table): result=result_type, in_=input1, permutation=perm_attr - ) - return permute_op + ) + output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + memref.CopyOp(permute_op, output) + return output # TODO: Consider the cases where the arguments take different values. 
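# The stride bookkeeping in permute_op above can be checked with a small
# standalone sketch (illustrative only; the helper name is made up). For a
# row-major tensor the strides are the running products of the trailing
# dimensions, and permuting the tensor merely reorders those strides.
def _strides_for(shape):
    strides, stride = [], 1
    for dim in reversed(shape):
        strides.insert(0, stride)
        stride *= dim
    return strides

# e.g. shape [2, 3, 4] gives strides [12, 4, 1]; the permutation [1, 2, 0]
# reorders them to [4, 1, 12], which is what the StridedLayoutAttr above
# encodes for the transposed view.
assert _strides_for([2, 3, 4]) == [12, 4, 1]
assert [_strides_for([2, 3, 4])[i] for i in [1, 2, 0]] == [4, 1, 12]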
@@ -239,10 +247,10 @@ def convolution2d_op(node: Conv2dOp, symbol_table): batch_size = input_shape[0] in_channels = input_shape[1] out_channels = output_shape[0] - H_in = input_shape[2] - W_in = input_shape[3] - H_out = output_shape[2] - W_out = output_shape[3] + in_size_h = input_shape[2] + in_size_w = input_shape[3] + out_size_h = output_shape[2] + out_size_w = output_shape[3] H_filter = filter_shape[2] W_filter = filter_shape[3] @@ -257,7 +265,7 @@ def convolution2d_op(node: Conv2dOp, symbol_table): gpu.HostRegisterOp(output_cast) # Tile the input_val into Grids - block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) out_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_channels)) @@ -307,7 +315,7 @@ def convolution2d_op(node: Conv2dOp, symbol_table): thread_local_idy = gpu_kernel_block.arguments[4] # Calculate the convolution element at (h, w) for this thread - tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) t0 = arith.divui(tile_id, tile_num_val) @@ -325,10 +333,10 @@ def convolution2d_op(node: Conv2dOp, symbol_table): # Check if the (h, w) is out of the output bounds ult = 6 - H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) - W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) - isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) + out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) + isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) isInBounds = arith.andi(isHInBounds, isWInBounds) cst_0 = arith.ConstantOp(element_type, mlir_element_attr_get(dtype, 0.0)) @@ -411,8 +419,8 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): batch_size = output_shape[0] in_channels = output_shape[1] - H_out = output_shape[2] - W_out = output_shape[3] + out_size_h = output_shape[2] + out_size_w = output_shape[3] input_val = symbol_table.get((str(input1), 0)) output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) @@ -424,7 +432,7 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): gpu.HostRegisterOp(output_cast) # Tile the input_val into Grids - block_z = ((H_out + TILE_WIDTH - 1) // TILE_WIDTH) * ((W_out + TILE_WIDTH - 1) // TILE_WIDTH) + block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) @@ -471,7 +479,7 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): 
thread_local_idy = gpu_kernel_block.arguments[4] # Calculate the convolution element at (h, w) for this thread - tile_num = (W_out + TILE_WIDTH - 1) // TILE_WIDTH + tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) t0 = arith.divui(tile_id, tile_num_val) @@ -482,35 +490,34 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): t3 = arith.muli(t2, tile_width_val) thread_global_idy = arith.addi(t3, thread_local_idy) - kernel_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) - kernel_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) + kernel_size_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) + kernel_size_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) - t4 = arith.muli(thread_global_idx, stride_h) - t5 = arith.muli(thread_global_idy, stride_w) - - first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, t4, t5]) + init_ele_idx = arith.muli(thread_global_idx, stride_h) + init_ele_idy = arith.muli(thread_global_idy, stride_w) # Check if the (h, w) is out of the output bounds ult = 6 - H_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_out)) - W_out_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_out)) - isHInBounds = arith.cmpi(ult, thread_global_idx, H_out_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, W_out_val) + out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) + out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) + isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) + isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) isInBounds = arith.andi(isHInBounds, isWInBounds) branch0 = scf.IfOp(isInBounds) with ir.InsertionPoint(branch0.then_block): + first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, init_ele_idx, init_ele_idy]) loop0 = scf.ForOp( lower_bound=c0.result, - upper_bound=kernel_h.result, + upper_bound=kernel_size_h.result, step=c1.result, iter_args=[first_ele.result] ) with ir.InsertionPoint(loop0.body): loop1 = scf.ForOp( lower_bound=c0.result, - upper_bound=kernel_w.result, + upper_bound=kernel_size_w.result, step=c1.result, iter_args=[first_ele.result] ) @@ -518,8 +525,8 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): # TODO : loop body kernel_ele_idx = loop0.body.arguments[0] kernel_ele_idy = loop1.body.arguments[0] - input_ele_idx = arith.addi(t4, kernel_ele_idx) - input_ele_idy = arith.addi(t5, kernel_ele_idy) + input_ele_idx = arith.addi(init_ele_idx, kernel_ele_idx) + input_ele_idy = arith.addi(init_ele_idy, kernel_ele_idy) input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) iter_arg1 = loop1.body.arguments[1] iter_res1 = arith.maxnumf(iter_arg1, input_ele) @@ -536,6 +543,7 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): return output_val + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 2b2dfe4ca2..012340d475 100644 --- 
a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -66,34 +66,3 @@ def tensor_shape_size(shape): for dim in shape: size *= dim return size - -def generate_strides(shape): - """ - Generate strides based on the input matrix shape. - - Args: - shape (list[int]): The shape of the input matrix, e.g., [2, 3, 4]. - - Returns: - list[int]: The corresponding strides, e.g., [12, 4, 1]. - """ - strides = [] - stride = 1 - for dim in reversed(shape): - strides.insert(0, stride) - stride *= dim - return strides - -def transpose_strides(strides, permutation): - """ - Reorder strides based on the input permutation. - - Args: - strides (list[int]): The original strides list, e.g., [12, 4, 1]. - permutation (list[int]): The permutation order, e.g., [1, 2, 0]. - - Returns: - list[int]: The reordered strides list, e.g., [4, 1, 12]. - """ - transposed_strides = [strides[i] for i in permutation] - return transposed_strides From 72cdc8269855e7b0baff3b22c280b7bdade9584b Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Tue, 10 Sep 2024 07:29:02 +0000 Subject: [PATCH 16/29] [frontend] Fix implementation error in permute and conv_2d operation --- examples/BuddyTest/CMakeLists.txt | 4 +- examples/BuddyTest/makefile | 20 +++++----- frontend/Python/ops/gpu.py | 61 ++++++++++++++++--------------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/examples/BuddyTest/CMakeLists.txt b/examples/BuddyTest/CMakeLists.txt index 2e3654b347..8039bfcc15 100644 --- a/examples/BuddyTest/CMakeLists.txt +++ b/examples/BuddyTest/CMakeLists.txt @@ -7,9 +7,9 @@ add_custom_command( add_custom_command( OUTPUT forward.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | ${LLVM_MLIR_BINARY_DIR}/llvm-as | ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyTest/forward.o diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile index 02aba04064..9c4c2e4a0c 100644 --- a/examples/BuddyTest/makefile +++ b/examples/BuddyTest/makefile @@ -20,27 +20,27 @@ MTRIPLE := x86_64-apple-darwin endif gpu-test-lower: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_OPT} -o log.mlir gpu-test-translate: - @${MLIR_OPT} forward.mlir 
-gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll gpu-test-run: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} gpu-conv2d-lower: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_OPT} -o log.mlir gpu-conv2d-translate: @@ -50,7 +50,7 @@ gpu-conv2d-translate: ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll gpu-conv2d-run: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ + @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, gpu-module-to-binary)" | \ + gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index 042770a8b5..fcf6d1f398 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -21,7 +21,7 @@ from typing import Tuple import mlir.ir as ir -from mlir.dialects import gpu, memref, arith, scf +from mlir.dialects import gpu, memref, arith, scf, vector from ..graph import TensorDType from ..graph import ( @@ -167,37 +167,38 @@ def permute_op(node: PermuteOp, symbol_table): operation. 
""" input1 = symbol_table.get((str(node.args[0]), 0)) - perm = node.args[1] - perm_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm)) + perm_map = node.args[1] + perm_map_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm_map)) output_shape = list(node.tensor_meta["shape"]) - element_type = mlir_element_type_get(node.tensor_meta["dtype"]) - input_shape = [0] * len(output_shape) - for i, p in enumerate(perm): - input_shape[p] = output_shape[i] - - # Prepare input_stride and output_stride data - input_stride = [] - stride = 1 - for dim in reversed(input_shape): - input_stride.insert(0, stride) - stride *= dim - output_stride = [input_stride[i] for i in perm] - - offset = 0 - result_type = ir.MemRefType.get( - shape=output_shape, - element_type=element_type, - layout=ir.StridedLayoutAttr.get(offset, output_stride) + dtype = node.tensor_meta["dtype"] + + element_type = mlir_element_type_get(dtype) + element_attr = mlir_element_attr_get(dtype, 0.0) + + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + f0 = arith.ConstantOp(element_type, element_attr) + + v0 = vector.transfer_read( + vector=ir.VectorType.get(output_shape, element_type), + source=input1, + indices=[c0]*len(output_shape), + permutation_map=perm_map_attr, + padding=f0 ) - permute_op = memref.TransposeOp( - result=result_type, - in_=input1, - permutation=perm_attr - ) - output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - memref.CopyOp(permute_op, output) - return output + + transpose = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + + vector.transfer_write( + result=None, + vector=v0, + source=transpose, + indices=[c0]*len(output_shape), + permutation_map=ir.AffineMapAttr.get( + ir.AffineMap.get_permutation([i for i in range(len(output_shape))]) + ) + ) + return transpose # TODO: Consider the cases where the arguments take different values. 
@@ -246,7 +247,7 @@ def convolution2d_op(node: Conv2dOp, symbol_table): batch_size = input_shape[0] in_channels = input_shape[1] - out_channels = output_shape[0] + out_channels = output_shape[1] in_size_h = input_shape[2] in_size_w = input_shape[3] out_size_h = output_shape[2] From cf703c7246b0d6a685de871d833c0f1b41232636 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Wed, 18 Sep 2024 13:39:14 +0000 Subject: [PATCH 17/29] [frontend] Add LeNet example for E2E execution in GPU device --- examples/BuddyLeNet/CMakeLists.txt | 73 +++++--- examples/BuddyLeNet/buddy-lenet-import.py | 4 +- frontend/Python/graph/graph.py | 28 ++- frontend/Python/graph/graph_driver.py | 7 +- frontend/Python/graph/transform/fuse_ops.py | 2 +- frontend/Python/ops/gpu.py | 179 +++++++++++++++++++- frontend/Python/ops/tosa.py | 108 ++++++++++-- 7 files changed, 349 insertions(+), 52 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 9698f617bc..c10571f883 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -18,37 +18,54 @@ add_custom_command( VERBATIM) add_custom_command( - OUTPUT subgraph0.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir - -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | - ${BUDDY_BINARY_DIR}/buddy-opt - -eliminate-empty-tensors - -convert-tensor-to-linalg - -linalg-bufferize - -convert-linalg-to-affine-loops - -lower-affine - -func-bufferize-dynamic-offset - -arith-bufferize - -tensor-bufferize - -buffer-deallocation - -finalizing-bufferize - -convert-vector-to-scf - -expand-strided-metadata - -convert-vector-to-llvm - -convert-arith-to-llvm - -finalize-memref-to-llvm - -convert-scf-to-cf - -llvm-request-c-wrappers - -convert-arith-to-llvm - -convert-func-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llvm-as | - ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + OUTPUT subgraph0.ll + COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir + COMMENT "Building subgraph0.ll" + VERBATIM) + +add_custom_command( + OUTPUT subgraph0.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll COMMENT "Building subgraph0.o" VERBATIM) +# add_custom_command( +# OUTPUT subgraph0.o +# COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, 
tosa-to-arith))" | +# ${BUDDY_BINARY_DIR}/buddy-opt +# -eliminate-empty-tensors +# -convert-tensor-to-linalg +# -linalg-bufferize +# -convert-linalg-to-affine-loops +# -lower-affine +# -func-bufferize-dynamic-offset +# -arith-bufferize +# -tensor-bufferize +# -buffer-deallocation +# -finalizing-bufferize +# -convert-vector-to-scf +# -expand-strided-metadata +# -convert-vector-to-llvm +# -convert-arith-to-llvm +# -finalize-memref-to-llvm +# -convert-scf-to-cf +# -llvm-request-c-wrappers +# -convert-arith-to-llvm +# -convert-func-to-llvm +# -reconcile-unrealized-casts | +# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | +# ${LLVM_MLIR_BINARY_DIR}/llvm-as | +# ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# COMMENT "Building subgraph0.o" +# VERBATIM) + add_library(LENET STATIC subgraph0.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) @@ -56,5 +73,5 @@ SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) add_executable(buddy-lenet-run buddy-lenet-main.cpp) target_link_directories(buddy-lenet-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${OpenCV_LIBS}) +set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_cuda_runtime ${OpenCV_LIBS}) target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS}) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 95e76de253..65ef5127f3 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -28,7 +28,7 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver from buddy.compiler.graph.transform import simply_fuse -from buddy.compiler.ops import tosa +from buddy.compiler.ops import tosa, gpu from model import LeNet # Retrieve the LeNet model path from environment variables. @@ -44,7 +44,7 @@ # Initialize Dynamo Compiler with specific configurations as an importer. dynamo_compiler = DynamoCompiler( - primary_registry=tosa.ops_registry, + primary_registry=gpu.ops_registry, aot_autograd_decomposition=inductor_decomp, ) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 7c99b4391d..3ca8aa298a 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -26,6 +26,7 @@ import mlir.ir as ir import mlir.dialects.func as func +import mlir.dialects.bufferization as buffer from mlir.passmanager import * from mlir.execution_engine import * from mlir import runtime as rt @@ -105,6 +106,7 @@ def __init__( fake_params: List[TensorMeta], ops_registry: dict, func_name: str, + device: DeviceType = DeviceType.GPU ) -> None: """ Initializes the Graph. 
@@ -123,7 +125,7 @@ def __init__( self._inputs = inputs self.node_table: Dict[str, Op] = {} self._fake_params = fake_params - self.device = "cpu" + self.device = device self._imported_module = None self._ops_registry = ops_registry self._func_name = func_name @@ -174,7 +176,7 @@ def init_op_group(self): continue group = [op] subgraph_name = "subgraph{}".format(i) - self.group_map_device[subgraph_name] = DeviceType.UNKNOW + self.group_map_device[subgraph_name] = DeviceType.GPU self.op_groups[subgraph_name] = group def fuse_ops(self, pattern_list: List[FunctionType]): @@ -237,6 +239,8 @@ def lower_to_top_level_ir(self): self._inputs, self._func_name, self._ops_registry, + False, + self.device ) self._imported_module = fx_importer.import_graph() outputs = fx_importer.get_output_nodes() @@ -347,6 +351,7 @@ def __init__( func_name: str, ops_registry: dict, do_param_pack: bool = False, + device: DeviceType = DeviceType.CPU, ): """ Initializes the buddy Graph importer. @@ -361,7 +366,7 @@ def __init__( ops_registry = {} self._symbol_table = {} self._body = body - self._device = DeviceType.GPU + self._device = device self._func_name = func_name self._params = params self._inputs = inputs @@ -441,7 +446,7 @@ def import_graph(self) -> ir.Module: shape_list = list(arg.shape) dtype = arg.dtype mlir_dtype = self._str_to_mlir_dtype(dtype) - tensor_arg = ir.MemRefType.get(shape_list, mlir_dtype) + tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) arguments.append(tensor_arg) extern_func = [] for node in self._body: @@ -461,6 +466,11 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] + # if self._device == DeviceType.GPU: + # returns = [ + # buffer.to_tensor(ret) + # for ret in returns + # ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) @@ -577,6 +587,16 @@ def _import_placeholder( else: placeholder_name = args_list[self._num_input_visited] + # TODO : Consider converting arg type from RankedTensorType to MemRefType + if self._device == DeviceType.GPU: + placeholder_name = buffer.to_memref( + ir.MemRefType.get( + list(node.tensor_meta.shape), + self._str_to_mlir_dtype(node.tensor_meta.dtype) + ), + placeholder_name + ) + self._symbol_table[(str(node.name), 0)] = placeholder_name self._num_input_visited += 1 diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 50a8869d5a..62a1239859 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -112,6 +112,7 @@ def build_subgraph_by_group(self): for subgraph_name in self._graph.op_groups.keys(): subgraph_input = [] subgraph_body = [] + subgraph_device = self._graph.group_map_device[subgraph_name] # Construct input placeholder nodes for inp in subgraphs_inputs[subgraph_name]: @@ -142,7 +143,11 @@ def build_subgraph_by_group(self): # Create subgraph and add it to the dictionary subgraph = Graph( - subgraph_input, [], self._graph._ops_registry, subgraph_name + subgraph_input, + [], + self._graph._ops_registry, + subgraph_name, + subgraph_device ) subgraph.body = subgraph_body for op in subgraph_body: diff --git a/frontend/Python/graph/transform/fuse_ops.py b/frontend/Python/graph/transform/fuse_ops.py index 61f6a5b54a..b3192653ad 100644 --- a/frontend/Python/graph/transform/fuse_ops.py +++ b/frontend/Python/graph/transform/fuse_ops.py @@ -39,7 +39,7 @@ def simply_fuse(graph: Graph): - None: Modifies the input graph in place. 
""" new_op_group = [] - device = DeviceType.UNKNOW + device = DeviceType.GPU for op in graph.body: if isinstance(op, PlaceholderOp): continue diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py index fcf6d1f398..9c8a5265e3 100644 --- a/frontend/Python/ops/gpu.py +++ b/frontend/Python/ops/gpu.py @@ -29,7 +29,8 @@ ReshapeOp, PermuteOp, Conv2dOp, - MaxPool2dOp + MaxPool2dOp, + AddMMOp ) from .utils import * @@ -107,6 +108,8 @@ def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): scf.YieldOp([]) gpu.TerminatorOp() + + gpu.HostUnregisterOp(input_cast) output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) memref.CopyOp(input, output) return output @@ -259,10 +262,12 @@ def convolution2d_op(node: Conv2dOp, symbol_table): unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) input_cast = memref.CastOp(unranked_memref_type, input_val) filter_cast = memref.CastOp(unranked_memref_type, filter_val) + bias_cast = memref.CastOp(unranked_memref_type, bias_val) output_cast = memref.CastOp(unranked_memref_type, output_val) gpu.HostRegisterOp(input_cast) gpu.HostRegisterOp(filter_cast) + gpu.HostRegisterOp(bias_cast) gpu.HostRegisterOp(output_cast) # Tile the input_val into Grids @@ -392,6 +397,11 @@ def convolution2d_op(node: Conv2dOp, symbol_table): scf.YieldOp([]) gpu.TerminatorOp() + + gpu.HostUnregisterOp(input_cast) + gpu.HostUnregisterOp(filter_cast) + gpu.HostUnregisterOp(bias_cast) + gpu.HostUnregisterOp(output_cast) return output_val @@ -542,13 +552,178 @@ def maxpool2d_op(node: MaxPool2dOp, symbol_table): gpu.TerminatorOp() + gpu.HostUnregisterOp(input_cast) + gpu.HostUnregisterOp(output_cast) + return output_val +def addmm_op( + node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation] +): + dtype = node.tensor_meta["dtype"] + element_type = mlir_element_type_get(dtype) + c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) + c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) + kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) + + # TODO: Reverse the order of the mat2 before multiplication to optimize the cache hit rate + + input_data = symbol_table.get((str(node.args[1]), 0), node.args[1]) + weight = symbol_table.get((str(node.args[2]), 0), node.args[2]) + bias = symbol_table.get((str(node.args[0]), 0), node.args[0]) + # print("input_data: "+str(input_data)) + # print("weight: "+str(weight)) + # print("bias: "+str(bias)) + + # TODO: Transpose of the mat2 before multiplication to optimize the cache hit rate + + output_shape = list(node.tensor_meta["shape"]) + input_shape = input_data.type.shape + weight_shape = weight.type.shape + # print("output_shape: "+str(output_shape)) + # print("output_shape: "+str()) + # print("input_shape: "+str(input_shape)) + # print("weight_shape: "+str(weight_shape)) + # print("bias shape: "+str(bias.type.shape)) + + # Flatten the input into a one-dimensional format + input_size = tensor_shape_size(input_shape) + weight_size = tensor_shape_size(weight_shape) + output_size = tensor_shape_size(output_shape) + # print("input_size: "+str(input_size)) + # print("weight_size: "+str(weight_size)) + # print("output_size: "+str(output_size)) + + input_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_size)) + weight_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 
weight_size)) + output_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) + # print("input_size_c: "+str(input_size_c)) + # print("weight_size_c: "+str(weight_size_c)) + # print("output_size_c: "+str(output_size_c)) + + input_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + weight_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + bias_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) + # print("input_shape_1d: "+str(input_shape_1d)) + # print("weight_shape_1d: "+str(weight_shape_1d)) + # print("bias_shape_1d: "+str(bias_shape_1d)) + + memref.StoreOp(input_size_c, input_shape_1d, [c0]) + memref.StoreOp(weight_size_c, weight_shape_1d, [c0]) + memref.StoreOp(output_size_c, bias_shape_1d, [c0]) + + input_reshape_type = ir.MemRefType.get([input_size], element_type) + weight_reshape_type = ir.MemRefType.get([weight_size], element_type) + bias_reshape_type = ir.MemRefType.get([output_size], element_type) + output_type = ir.MemRefType.get(output_shape, element_type) + # print("input_reshape_type: "+str(input_reshape_type)) + # print("weight_reshape_type: "+str(weight_reshape_type)) + # print("bias_reshape_type: "+str(bias_reshape_type)) + # print("output_type: "+str(output_type)) + + input_reshape_1d = memref.ReshapeOp(input_reshape_type, input_data, input_shape_1d) + weight_reshape_1d = memref.ReshapeOp(weight_reshape_type, weight, weight_shape_1d) + bias_reshape_1d = memref.ReshapeOp(bias_reshape_type, bias, bias_shape_1d) + # print("input_reshape: "+str(input_reshape_1d)) + # print("weight_reshape: "+str(weight_reshape_1d)) + # print("bias_reshape: "+str(bias_reshape_1d)) + + + unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) + gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, input_reshape_1d)) + gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, weight_reshape_1d)) + gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, bias_reshape_1d)) + + row = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[0])) + col = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), weight_shape[1])) + inner_dim = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[1])) + + gpu_kernel = gpu.LaunchOp( + asyncToken=None, + asyncDependencies=[], + gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, + blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, + ) + gpu_kernel_block = ir.Block.create_at_start( + gpu_kernel.body, + [ + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z + ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z + ] + ) + + # TODO: optimize to one dimension + with ir.InsertionPoint(gpu_kernel_block): + tIdX = gpu_kernel_block.arguments[3] + tIdY = gpu_kernel_block.arguments[4] + otter_loop = scf.ForOp( + lower_bound=tIdX, + upper_bound=row, + step=gpu_kernel.blockSizeX + ) + with ir.InsertionPoint(otter_loop.body): + inner_loop = scf.ForOp( + lower_bound=tIdY, + upper_bound=col, + step=gpu_kernel.blockSizeY + ) + with ir.InsertionPoint(inner_loop.body): + initial_sum = 
arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) + + mul_loop = scf.ForOp( + lower_bound=c0.result, + upper_bound=inner_dim, + step=c1.result, + iter_args=[initial_sum] + ) + with ir.InsertionPoint(mul_loop.body): + sum = mul_loop.inner_iter_args[0] + mat1_load = memref.LoadOp(input_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, inner_dim).result, mul_loop.induction_variable)]) + mat2_load = memref.LoadOp(weight_reshape_1d, [arith.AddIOp(arith.MulIOp(mul_loop.induction_variable, col).result, inner_loop.induction_variable)]) + res = arith.MulFOp(mat1_load, mat2_load) + res = arith.AddFOp(sum, res) + scf.YieldOp([res]) + + sum = mul_loop.result + bias_load = memref.LoadOp(bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) + res = arith.AddFOp(sum, bias_load) + memref.StoreOp(res, bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) + scf.YieldOp([]) + scf.YieldOp([]) + gpu.TerminatorOp() + + + output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) + + # FIXME: Dialect `memref' not found for custom op 'memref.expand_shape' + # axis = ir.ArrayAttr.get( + # [ + # ir.IntegerAttr.get(ir.IntegerType.get_signless(64), i) + # for i in range(len(output_shape)) + # ], + # None, + # ) + # axis = ir.ArrayAttr.get([axis], None) + # bias_reshape = memref.ExpandShapeOp(output_type, bias, axis) + + bias_shape = memref.AllocOp(ir.MemRefType.get([len(output_shape)], ir.IndexType.get()), [], []) + # print("bias_shape: "+str(bias_shape)) + for i in range(len(output_shape)): + memref.StoreOp(arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_shape[i])), bias_shape, [arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i))]) + + bias_reshape = memref.ReshapeOp(output_type, bias, bias_shape) + memref.CopyOp(bias_reshape, output) + return output + + ops_registry = { "ReluOp": relu_op, "ViewOp": reshape_op, "PermuteOp": permute_op, "Conv2dOp": convolution2d_op, - "MaxPool2dOp": maxpool2d_op + "MaxPool2dOp": maxpool2d_op, + "AddMMOp": addmm_op } diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py index 839ff268da..d9633cf7b4 100644 --- a/frontend/Python/ops/tosa.py +++ b/frontend/Python/ops/tosa.py @@ -1002,15 +1002,17 @@ def convolution2d_op(node: Conv2dOp, symbol_table): result_element_type = mlir_element_type_get(dtype) out_shape = node.tensor_meta["shape"] + # Prepare Depthwise Conv2D information + is_grouped = (list(weight_shape)[1] == 1) and (groups != 1) + is_depthwise = (groups == list(weight_shape)[0]) and is_grouped + # Prepare input channel and output channel. - # TODO: confirm and modify this part. if is_kernel_transposed: in_channels = list(weight_shape)[0] - out_channels = list(weight_shape)[1] + out_channels = list(weight_shape)[1] * groups else: - in_channels = list(weight_shape)[1] + in_channels = list(weight_shape)[1] * groups out_channels = list(weight_shape)[0] - is_depthwise = (groups == in_channels) or (groups == out_channels) # Prepare bias tensor. if len(node._parents) == 2: @@ -1025,20 +1027,19 @@ def convolution2d_op(node: Conv2dOp, symbol_table): else: bias_tensor = symbol_table.get((str(bias), 0)) - # Prepare input padding. 
-    if len(input_padding) == 1:
-        input_padding = [input_padding[0]] * 4
-    elif len(input_padding) == 2:
-        input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2
-
     # Prepare attributes.
-    input_padding_attr = ir._denseI64ArrayAttr(input_padding, None)
     dilation_attr = ir._denseI64ArrayAttr(dilation, None)
     stride_attr = ir._denseI64ArrayAttr(stride, None)
 
-    # TODO: Convolution 1D
     # Convolution 2D
     if len(weight_shape) == 4:
+        # Prepare input padding.
+        if len(input_padding) == 1:
+            input_padding = [input_padding[0]] * 4
+        elif len(input_padding) == 2:
+            input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2
+        # Prepare input_padding attributes.
+        input_padding_attr = ir._denseI64ArrayAttr(input_padding, None)
         # If the input layout is NCHW, then convert to NHWC.
         if node._layout.find("NCHW") != -1:
             perm_list = [0, 2, 3, 1]
@@ -1068,9 +1069,9 @@ def convolution2d_op(node: Conv2dOp, symbol_table):
             out_shape = perm_shape
         output_type = ir.RankedTensorType.get(out_shape, result_element_type)
 
+        # Depthwise Conv2D Operation.
         if is_depthwise is True:
-            # Depthwise Conv2D Operation.
-            # TODO: the layout may lead misunderstanding
+            # If groups == in_channels, then out_channels == in_channels.
             if node._layout.find("FCHW") != -1:
                 perm_list = [2, 3, 0, 1]
                 perm_const_op = tosa.ConstOp(
@@ -1166,9 +1167,88 @@ def convolution2d_op(node: Conv2dOp, symbol_table):
             op = tosa.TransposeOp(
                 permute_result_type, op.result, perm_const_op.results[0]
             )
+    # Convolution 1D
+    elif len(weight_shape) == 3:
+        # Prepare input with padding.
+        if input_padding[0] != 0:
+            input_shape = list(ir.RankedTensorType(input_val.type).shape)
+            padded_type = ir.RankedTensorType.get(
+                [
+                    input_shape[0],
+                    input_shape[1],
+                    input_shape[2] + 2 * input_padding[0],
+                ],
+                result_element_type,
+            )
+            pad_values_type = ir.RankedTensorType.get(
+                [3, 2], ir.IntegerType.get_signless(32)
+            )
+            pad_values = ir.DenseElementsAttr.get(
+                numpy.array(
+                    [[0, 0], [0, 0], [input_padding[0], input_padding[0]]],
+                    dtype=numpy.int32,
+                ),
+                type=pad_values_type,
+            )
+            pad_constant = arith.ConstantOp(pad_values_type, pad_values).result
+            input_val = tosa.PadOp(padded_type, input_val, pad_constant)
+        output_type = ir.RankedTensorType.get(out_shape, result_element_type)
+        output_conv = tensor.EmptyOp(list(out_shape), result_element_type)
+        assert groups == 1, "only support one group"
+        # Conv1D Operation Without Bias
+        conv_op = linalg.conv_1d_ncw_fcw(
+            input_val,
+            weight_val,
+            outs=[output_conv],
+            strides=stride_attr,
+            dilations=dilation_attr,
+        )
+        output = tensor.EmptyOp(list(out_shape), result_element_type)
+        generic_map = ir.AffineMap.get_permutation(
+            [i for i in range(len(list(out_shape)))]
+        )
+        loop_type = [
+            ir.Attribute.parse("#linalg.iterator_type")
+        ] * len(list(out_shape))
+        loop_type[1] = ir.Attribute.parse("#linalg.iterator_type")
+        # Add Bias To Conv1d.
+ op = linalg.GenericOp( + [output_type], + [conv_op, bias_tensor], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(list(out_shape)))] + ) + ), + ir.AffineMapAttr.get(generic_map.get_submap([1])), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(list(out_shape)))] + ) + ), + ] + ), + ir.ArrayAttr.get(loop_type), + ) + block = ir.Block.create_at_start( + op.region, + [ + result_element_type, + ir.RankedTensorType(bias_tensor.type).element_type, + result_element_type, + ], + ) + add_op = arith.AddFOp(block.arguments[1], block.arguments[0]) + block.append(add_op) + block.append(linalg.YieldOp([add_op.result])) + return op + def relu_op(node: ReluOp, symbol_table): """ Import the tensor relu operation. From 9a88cb2e4e2cccfcff10ca989940df0401826316 Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Sat, 21 Sep 2024 02:58:35 +0000 Subject: [PATCH 18/29] [frontend] Add the custom subgraph partitioning interface --- examples/BuddyLeNet/CMakeLists.txt | 23 +++- examples/BuddyLeNet/buddy-lenet-import.py | 4 + examples/BuddyLeNet/subgraph1.mlir | 25 +++++ frontend/Python/graph/graph.py | 31 ++++-- frontend/Python/graph/graph_driver.py | 128 +++++++++++++++------- frontend/Python/graph/operation.py | 4 +- frontend/Python/ops/func.py | 4 +- 7 files changed, 167 insertions(+), 52 deletions(-) create mode 100644 examples/BuddyLeNet/subgraph1.mlir diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index c10571f883..f391347466 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -1,7 +1,7 @@ add_custom_command( - OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data + OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py - COMMENT "Generating forward.mlir, subgraph0.mlir and parameter files" + COMMENT "Generating forward.mlir, subgraph1.mlir and parameter files" ) add_custom_command( @@ -33,6 +33,23 @@ add_custom_command( DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll COMMENT "Building subgraph0.o" VERBATIM) + +add_custom_command( + OUTPUT subgraph1.ll + COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | + ${LLVM_MLIR_BINARY_DIR}/mlir-opt + -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMENT "Building subgraph1.ll" + VERBATIM) + +add_custom_command( + OUTPUT subgraph1.o + COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll + COMMENT "Building subgraph1.o" + VERBATIM) # 
add_custom_command( # OUTPUT subgraph0.o @@ -66,7 +83,7 @@ add_custom_command( # COMMENT "Building subgraph0.o" # VERBATIM) -add_library(LENET STATIC subgraph0.o forward.o) +add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 65ef5127f3..4b3160ea4e 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -59,10 +59,14 @@ pattern_list = [simply_fuse] graphs[0].fuse_ops(pattern_list) driver = GraphDriver(graphs[0]) +print(len(driver.subgraphs)) driver.subgraphs[0].lower_to_top_level_ir() +driver.subgraphs[1].lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) +with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: + print(driver.subgraphs[1]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file) diff --git a/examples/BuddyLeNet/subgraph1.mlir b/examples/BuddyLeNet/subgraph1.mlir new file mode 100644 index 0000000000..964dc7bbb8 --- /dev/null +++ b/examples/BuddyLeNet/subgraph1.mlir @@ -0,0 +1,25 @@ +#map = affine_map<(d0, d1) -> (d1, d0)> +module attributes {gpu.container_module} { + func.func @subgraph1(%arg0: tensor<120x256xf32>, %arg1: tensor<84x120xf32>, %arg2: tensor<10x84xf32>) -> (memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32>) { + %0 = bufferization.to_memref %arg0 : memref<120x256xf32> + %1 = bufferization.to_memref %arg1 : memref<84x120xf32> + %2 = bufferization.to_memref %arg2 : memref<10x84xf32> + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %3 = vector.transfer_read %0[%c0, %c0], %cst {permutation_map = #map} : memref<120x256xf32>, vector<256x120xf32> + %alloc = memref.alloc() : memref<256x120xf32> + vector.transfer_write %3, %alloc[%c0, %c0] : vector<256x120xf32>, memref<256x120xf32> + %c0_0 = arith.constant 0 : index + %cst_1 = arith.constant 0.000000e+00 : f32 + %4 = vector.transfer_read %1[%c0_0, %c0_0], %cst_1 {permutation_map = #map} : memref<84x120xf32>, vector<120x84xf32> + %alloc_2 = memref.alloc() : memref<120x84xf32> + vector.transfer_write %4, %alloc_2[%c0_0, %c0_0] : vector<120x84xf32>, memref<120x84xf32> + %c0_3 = arith.constant 0 : index + %cst_4 = arith.constant 0.000000e+00 : f32 + %5 = vector.transfer_read %2[%c0_3, %c0_3], %cst_4 {permutation_map = #map} : memref<10x84xf32>, vector<84x10xf32> + %alloc_5 = memref.alloc() : memref<84x10xf32> + vector.transfer_write %5, %alloc_5[%c0_3, %c0_3] : vector<84x10xf32>, memref<84x10xf32> + return %alloc, %alloc_2, %alloc_5 : memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32> + } +} + diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 3ca8aa298a..b1c8666c38 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -171,13 +171,26 @@ def init_op_group(self): Returns: - None """ + # for i, op in enumerate(self._body): + # if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp): + # continue + # group = [op] + # subgraph_name = "subgraph{}".format(i) + # self.group_map_device[subgraph_name] = DeviceType.GPU + # self.op_groups[subgraph_name] = group + group = [] for i, op in enumerate(self._body): - 
if isinstance(op, PlaceholderOp): + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i==18 or i==21 or i==24: continue - group = [op] - subgraph_name = "subgraph{}".format(i) - self.group_map_device[subgraph_name] = DeviceType.GPU - self.op_groups[subgraph_name] = group + group.append(op) + subgraph_name = "subgraph0" + self.group_map_device[subgraph_name] = DeviceType.GPU + self.op_groups[subgraph_name] = group + + new_group = [self._body[18], self._body[21], self._body[24]] + subgraph_name = "subgraph1" + self.group_map_device[subgraph_name] = DeviceType.GPU + self.op_groups[subgraph_name] = new_group def fuse_ops(self, pattern_list: List[FunctionType]): """ @@ -197,9 +210,9 @@ def fuse_ops(self, pattern_list: List[FunctionType]): # Initialize operation groups self.init_op_group() - # Apply fusion patterns - for pattern_func in pattern_list: - pattern_func(self) + # # Apply fusion patterns + # for pattern_func in pattern_list: + # pattern_func(self) def perform(self, func_list: List[FunctionType]): """ @@ -541,7 +554,7 @@ def generated_func(*args): ] else: self._import_op(node) - + print(self._symbol_table) return self._symbol_table.get(("output", 0)) return self._module diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 62a1239859..a8fbbf1f71 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -21,6 +21,7 @@ # ===--------------------------------------------------------------------------- from mlir import ir +from collections import deque, defaultdict from .graph import Graph, GraphImporter, TensorMeta from .operation import FuncOp, CallOp, PlaceholderOp, OutputOp, GetItemOp @@ -52,6 +53,11 @@ def __init__(self, graph: Graph) -> None: - None """ self._graph = graph + self._subgraph_dependencies = { + subgraph_name : set() + for subgraph_name in list(self._graph.op_groups.keys()) + } + self._call_table = {} ( self._subgraphs, self._subgraphs_inputs, @@ -95,13 +101,14 @@ def build_subgraph_by_group(self): for arg in node.args: output_node.append(arg) - # Identify outputs for each subgraph + # Identify outputs for each subgraph and build dependencies between subgraphs for subgraph_name in self._graph.op_groups.keys(): subgraphs_outputs[subgraph_name] = [] for op in self._graph.op_groups[subgraph_name]: for key in subgraphs_inputs.keys(): if op.name in subgraphs_inputs[key]: subgraphs_outputs[subgraph_name].append(op.name) + self._subgraph_dependencies[subgraph_name].add(key) if (op.name in output_node) and ( op.name not in subgraphs_outputs[subgraph_name] ): @@ -156,6 +163,38 @@ def build_subgraph_by_group(self): return subgraphs, subgraphs_inputs, subgraphs_outputs + def topological_sort_subgraph(self): + """ + Performs topological sorting on the subgraphs based on their dependencies. + + Args: + - graph (Graph): The graph from which subgraphs are constructed. + + Returns: + - list: A list of subgraph names in topological order if the graph is acyclic; otherwise, None. 
+ """ + + # Calculate in degree of each subgraph + in_degree = { subgraph_name : 0 for subgraph_name in list(self._subgraphs.keys()) } + for src, dests in self._subgraph_dependencies.items(): + for dest in dests: + in_degree[dest] += 1 + + # Topological sorting + queue = deque([node for node in in_degree if in_degree[node] == 0]) + topo_order = [] + + while queue: + node = queue.popleft() + topo_order.append(node) + for child in self._subgraph_dependencies[node]: + in_degree[child] -= 1 + if in_degree[child] == 0: + queue.append(child) + + # TODO: If the custom subgraph partitioning is illegal, further partition the subgraph to make it valid. + return topo_order if len(topo_order) == len(list(self._subgraphs.keys())) else None + def construct_main_graph(self, do_param_pack=False): """ Constructs the main computational graph by incorporating subgraphs' call @@ -193,53 +232,68 @@ def construct_main_graph(self, do_param_pack=False): func_node.tensor_meta["dtype"].append( self._graph.node_table[output].tensor_meta["dtype"] ) - main_graph.body.append(func_node) + main_graph.add_node(func_node) # Adding placeholder operations from the original graph for op in self._graph.body: if isinstance(op, PlaceholderOp): - main_graph.body.append(op) + main_graph.add_node(op) + + # Analysis topology order to sort subgraph call. + topo_order = self.topological_sort_subgraph() + if topo_order == None: + print('Error : Graph Partitioning is illegal!') + return None - # TODO: analysis topology order to sort subgraph call. - if len(self._subgraphs) == 1: - # Adding CallOp to invoke the single subgraph + # Adding CallOp to invoke the single subgraph + for i, subgraph_name in enumerate(topo_order): call_node = CallOp() - call_node.name = "call0" - call_node.call_func_name = list(self._subgraphs.keys())[0] + call_node.name = "call{}".format(i) + call_node.call_func_name = subgraph_name call_node.tensor_meta = {"shape": [], "dtype": []} - for inp in list(self._subgraphs_inputs.values())[0]: - call_node.add_argument(inp) - for output in list(self._subgraphs_outputs.values())[0]: + for inp in self._subgraphs_inputs[subgraph_name]: + if inp in main_graph.node_table: + call_node.add_argument(inp) + continue + for key, value in self._subgraphs_outputs.items(): + if inp in value: + call_node.add_argument( + arg=self._call_table[key].name, + arg_index=value.index(inp) + ) + break + for output in self._subgraphs_outputs[subgraph_name]: call_node.tensor_meta["shape"].append( self._graph.node_table[output].tensor_meta["shape"] ) call_node.tensor_meta["dtype"].append( self._graph.node_table[output].tensor_meta["dtype"] ) - main_graph.body.append(call_node) + self._call_table[subgraph_name] = call_node + main_graph.add_node(call_node) - # Adding GetItemOps to retrieve individual output tensors - output_node = OutputOp() - for i, output in enumerate(list(self._subgraphs_outputs.values())[0]): - getitem_node = GetItemOp() - getitem_node.add_argument(call_node.name) - getitem_node.add_argument(i) - getitem_node.name = "getitem{}".format(i) - output_node.add_argument(getitem_node.name) - main_graph.body.append(getitem_node) - - # Marking the final output of the main graph - output_node.name = "output" - main_graph.body.append(output_node) - - # Importing the main graph - with ir.Location.unknown(ir.Context()): - main_importer = GraphImporter( - main_graph.body, - main_graph._fake_params, - main_graph._inputs, - main_graph._func_name, - main_graph._ops_registry, - do_param_pack, - ) - return main_importer.import_main_graph() + # 
Adding GetItemOps to retrieve individual output tensors + output_node = OutputOp() + for i, output in enumerate(self._subgraphs_outputs[topo_order[-1]]): + getitem_node = GetItemOp() + getitem_node.add_argument(call_node.name) + getitem_node.add_argument(i) + getitem_node.name = "getitem{}".format(i) + output_node.add_argument(getitem_node.name) + main_graph.add_node(getitem_node) + + # Marking the final output of the main graph + output_node.name = "output" + main_graph.add_node(output_node) + + # Importing the main graph + with ir.Location.unknown(ir.Context()): + main_importer = GraphImporter( + main_graph.body, + main_graph._fake_params, + main_graph._inputs, + main_graph._func_name, + main_graph._ops_registry, + do_param_pack, + ) + return main_importer.import_main_graph() diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py index 14bfbf2752..c2dc186a39 100644 --- a/frontend/Python/graph/operation.py +++ b/frontend/Python/graph/operation.py @@ -81,13 +81,14 @@ def __init__(self) -> None: """ self._name = None self._arguments = [] + self._args_index = [] self._keyword_arguments = {} self._tensor_meta: Dict = {} self._op_type: OpType = None self._children: List[str] = [] self._parents: List[str] = [] - def add_argument(self, arg): + def add_argument(self, arg, arg_index=0): """ Add an input argument to the operation node. @@ -96,6 +97,7 @@ def add_argument(self, arg): The input argument to be added. """ self._arguments.append(arg) + self._args_index.append(arg_index) def add_parent(self, parent: str): """ diff --git a/frontend/Python/ops/func.py b/frontend/Python/ops/func.py index a7dcc5e11b..e885809d82 100644 --- a/frontend/Python/ops/func.py +++ b/frontend/Python/ops/func.py @@ -59,8 +59,8 @@ def call_op(node: CallOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): From Buddy CallOp to MLIR FUNC call operation. 
""" arguments = [] - for arg in node.args: - input_node = symbol_table.get((str(arg), 0)) + for i, arg in enumerate(node.args): + input_node = symbol_table.get((str(arg), node._args_index[i])) memref_type = ir.MemRefType(input_node.type) stride = [] shape = memref_type.shape From 2f91175702db755addbc0fb013a8e2a23d40901a Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Sun, 22 Sep 2024 10:37:14 +0000 Subject: [PATCH 19/29] [frontend] Fix error in graph partitioning interface --- examples/BuddyLeNet/CMakeLists.txt | 88 +++++++++++------------ examples/BuddyLeNet/buddy-lenet-import.py | 4 +- examples/BuddyLeNet/subgraph1.mlir | 31 +++----- frontend/Python/frontend.py | 4 ++ frontend/Python/graph/graph.py | 30 +++++--- frontend/Python/graph/graph_driver.py | 3 + 6 files changed, 83 insertions(+), 77 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index f391347466..7552f25f55 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -19,7 +19,7 @@ add_custom_command( add_custom_command( OUTPUT subgraph0.ll - COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | + COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | ${LLVM_MLIR_BINARY_DIR}/mlir-opt -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll @@ -34,55 +34,55 @@ add_custom_command( COMMENT "Building subgraph0.o" VERBATIM) -add_custom_command( - OUTPUT subgraph1.ll - COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir - COMMENT "Building subgraph1.ll" - VERBATIM) +# add_custom_command( +# OUTPUT subgraph1.ll +# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | +# ${LLVM_MLIR_BINARY_DIR}/mlir-opt +# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), 
convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | +# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir +# COMMENT "Building subgraph1.ll" +# VERBATIM) +# add_custom_command( +# OUTPUT subgraph1.o +# COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o +# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# COMMENT "Building subgraph1.o" +# VERBATIM) + add_custom_command( OUTPUT subgraph1.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o - DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll + COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | + ${BUDDY_BINARY_DIR}/buddy-opt + -eliminate-empty-tensors + -convert-tensor-to-linalg + -linalg-bufferize + -convert-linalg-to-affine-loops + -lower-affine + -func-bufferize-dynamic-offset + -arith-bufferize + -tensor-bufferize + -buffer-deallocation + -finalizing-bufferize + -convert-vector-to-scf + -expand-strided-metadata + -convert-vector-to-llvm + -convert-arith-to-llvm + -finalize-memref-to-llvm + -convert-scf-to-cf + -llvm-request-c-wrappers + -convert-arith-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llvm-as | + ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir COMMENT "Building subgraph1.o" VERBATIM) -# add_custom_command( -# OUTPUT subgraph0.o -# COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | -# ${BUDDY_BINARY_DIR}/buddy-opt -# -eliminate-empty-tensors -# -convert-tensor-to-linalg -# -linalg-bufferize -# -convert-linalg-to-affine-loops -# -lower-affine -# -func-bufferize-dynamic-offset -# -arith-bufferize -# -tensor-bufferize -# -buffer-deallocation -# -finalizing-bufferize -# -convert-vector-to-scf -# -expand-strided-metadata -# -convert-vector-to-llvm -# -convert-arith-to-llvm -# -finalize-memref-to-llvm -# -convert-scf-to-cf -# -llvm-request-c-wrappers -# -convert-arith-to-llvm -# -convert-func-to-llvm -# -reconcile-unrealized-casts | -# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | -# ${LLVM_MLIR_BINARY_DIR}/llvm-as | -# ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# COMMENT "Building subgraph0.o" -# VERBATIM) - add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 4b3160ea4e..fa0883c8b9 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -44,7 +44,7 @@ # 
Initialize Dynamo Compiler with specific configurations as an importer. dynamo_compiler = DynamoCompiler( - primary_registry=gpu.ops_registry, + primary_registry=tosa.ops_registry, aot_autograd_decomposition=inductor_decomp, ) @@ -60,6 +60,8 @@ graphs[0].fuse_ops(pattern_list) driver = GraphDriver(graphs[0]) print(len(driver.subgraphs)) +print(driver.subgraphs[0].device) +print(driver.subgraphs[1].device) driver.subgraphs[0].lower_to_top_level_ir() driver.subgraphs[1].lower_to_top_level_ir() path_prefix = os.path.dirname(os.path.abspath(__file__)) diff --git a/examples/BuddyLeNet/subgraph1.mlir b/examples/BuddyLeNet/subgraph1.mlir index 964dc7bbb8..a5d052b8d6 100644 --- a/examples/BuddyLeNet/subgraph1.mlir +++ b/examples/BuddyLeNet/subgraph1.mlir @@ -1,25 +1,12 @@ -#map = affine_map<(d0, d1) -> (d1, d0)> -module attributes {gpu.container_module} { - func.func @subgraph1(%arg0: tensor<120x256xf32>, %arg1: tensor<84x120xf32>, %arg2: tensor<10x84xf32>) -> (memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32>) { - %0 = bufferization.to_memref %arg0 : memref<120x256xf32> - %1 = bufferization.to_memref %arg1 : memref<84x120xf32> - %2 = bufferization.to_memref %arg2 : memref<10x84xf32> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %3 = vector.transfer_read %0[%c0, %c0], %cst {permutation_map = #map} : memref<120x256xf32>, vector<256x120xf32> - %alloc = memref.alloc() : memref<256x120xf32> - vector.transfer_write %3, %alloc[%c0, %c0] : vector<256x120xf32>, memref<256x120xf32> - %c0_0 = arith.constant 0 : index - %cst_1 = arith.constant 0.000000e+00 : f32 - %4 = vector.transfer_read %1[%c0_0, %c0_0], %cst_1 {permutation_map = #map} : memref<84x120xf32>, vector<120x84xf32> - %alloc_2 = memref.alloc() : memref<120x84xf32> - vector.transfer_write %4, %alloc_2[%c0_0, %c0_0] : vector<120x84xf32>, memref<120x84xf32> - %c0_3 = arith.constant 0 : index - %cst_4 = arith.constant 0.000000e+00 : f32 - %5 = vector.transfer_read %2[%c0_3, %c0_3], %cst_4 {permutation_map = #map} : memref<10x84xf32>, vector<84x10xf32> - %alloc_5 = memref.alloc() : memref<84x10xf32> - vector.transfer_write %5, %alloc_5[%c0_3, %c0_3] : vector<84x10xf32>, memref<84x10xf32> - return %alloc, %alloc_2, %alloc_5 : memref<256x120xf32>, memref<120x84xf32>, memref<84x10xf32> +module { + func.func @subgraph1(%arg0: tensor<120x256xf32>, %arg1: tensor<84x120xf32>, %arg2: tensor<10x84xf32>) -> (tensor<256x120xf32>, tensor<120x84xf32>, tensor<84x10xf32>) { + %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %1 = tosa.transpose %arg0, %0 : (tensor<120x256xf32>, tensor<2xi32>) -> tensor<256x120xf32> + %2 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %3 = tosa.transpose %arg1, %2 : (tensor<84x120xf32>, tensor<2xi32>) -> tensor<120x84xf32> + %4 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %5 = tosa.transpose %arg2, %4 : (tensor<10x84xf32>, tensor<2xi32>) -> tensor<84x10xf32> + return %1, %3, %5 : tensor<256x120xf32>, tensor<120x84xf32>, tensor<84x10xf32> } } diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index f30eb2a28a..dec046024f 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -42,6 +42,7 @@ from .ops.tosa import ops_registry as tosa_ops_registry from .ops.math import ops_registry as math_ops_registry from .ops.func import ops_registry as func_ops_registry +from .ops.gpu import ops_registry as gpu_ops_registry from .graph import Graph, 
TensorDType, TensorMeta from .graph.operation import * from .graph.transform import maxpool2d_simplify @@ -98,12 +99,14 @@ def __init__( self._verbose = verbose self._imported_graphs = [] self._ops_registry = {} + self._ops_gpu_registry = {} self._imported_params = {} self._ops_registry.update(math_ops_registry) self._ops_registry.update(linalg_ops_registry) self._ops_registry.update(tosa_ops_registry) self._ops_registry.update(func_ops_registry) self._ops_registry.update(primary_registry) + self._ops_gpu_registry.update(gpu_ops_registry) self._ops_map = { "output": OutputOp, "placeholder": PlaceholderOp, @@ -276,6 +279,7 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): func_inputs, fake_params, self._ops_registry, + self._ops_gpu_registry, self._func_name, ) for gm_node in _gm.graph.nodes: diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index b1c8666c38..86867b4d4d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -105,6 +105,7 @@ def __init__( inputs: List[TensorMeta], fake_params: List[TensorMeta], ops_registry: dict, + ops_gpu_registry: dict, func_name: str, device: DeviceType = DeviceType.GPU ) -> None: @@ -128,6 +129,7 @@ def __init__( self.device = device self._imported_module = None self._ops_registry = ops_registry + self._ops_gpu_registry = ops_gpu_registry self._func_name = func_name self._ctx = ir.Context() self._output_memref = None @@ -189,7 +191,7 @@ def init_op_group(self): new_group = [self._body[18], self._body[21], self._body[24]] subgraph_name = "subgraph1" - self.group_map_device[subgraph_name] = DeviceType.GPU + self.group_map_device[subgraph_name] = DeviceType.CPU self.op_groups[subgraph_name] = new_group def fuse_ops(self, pattern_list: List[FunctionType]): @@ -252,6 +254,7 @@ def lower_to_top_level_ir(self): self._inputs, self._func_name, self._ops_registry, + self._ops_gpu_registry, False, self.device ) @@ -261,7 +264,7 @@ def lower_to_top_level_ir(self): output_ranks = [] output_dtypes = [] for out_node in outputs: - out_type = ir.MemRefType(out_node.type) + out_type = ir.RankedTensorType(out_node.type) shape = list(out_type.shape) dtype = out_type.element_type match str(dtype): @@ -363,6 +366,7 @@ def __init__( inputs: List[TensorMeta], func_name: str, ops_registry: dict, + ops_gpu_registry: dict, do_param_pack: bool = False, device: DeviceType = DeviceType.CPU, ): @@ -388,6 +392,7 @@ def __init__( self._num_input_visited = 0 self._module = ir.Module.create() self._ops_registry = ops_registry + self._ops_gpu_registry = ops_gpu_registry self._current_param_pack_offset = None def _str_to_mlir_dtype(self, dtype: str) -> ir.Type: @@ -479,11 +484,11 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] - # if self._device == DeviceType.GPU: - # returns = [ - # buffer.to_tensor(ret) - # for ret in returns - # ] + if self._device == DeviceType.GPU: + returns = [ + buffer.to_tensor(ret) + for ret in returns + ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) @@ -623,9 +628,14 @@ def _import_op(self, node: Op): """ op_name = node.__class__.__name__ - op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - self._ops_registry[op_name](node, self._symbol_table) - ) + if self._device == DeviceType.CPU: + op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + self._ops_registry[op_name](node, self._symbol_table) + ) + else: + op_ret: 
ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + self._ops_gpu_registry[op_name](node, self._symbol_table) + ) if isinstance(op_ret, tuple | List): for i, operation in enumerate(op_ret): if isinstance(operation, ir.Operation) or isinstance( diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index a8fbbf1f71..8ff8966be2 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -153,6 +153,7 @@ def build_subgraph_by_group(self): subgraph_input, [], self._graph._ops_registry, + self._graph._ops_gpu_registry, subgraph_name, subgraph_device ) @@ -215,6 +216,7 @@ def construct_main_graph(self, do_param_pack=False): self._graph._inputs, self._graph._fake_params, self._graph._ops_registry, + self._graph._ops_gpu_registry, self._graph._func_name, ) @@ -294,6 +296,7 @@ def construct_main_graph(self, do_param_pack=False): main_graph._inputs, main_graph._func_name, main_graph._ops_registry, + main_graph._ops_gpu_registry, do_param_pack, ) return main_importer.import_main_graph() From 29745ef8e100ed2003fdf4b3bd6dad61dbf3256c Mon Sep 17 00:00:00 2001 From: wdjyd <1014108056@qq.com> Date: Thu, 26 Sep 2024 03:41:12 +0000 Subject: [PATCH 20/29] [frontend] Add JSON format interface for subgraph partitioning implementation --- examples/BuddyLeNet/graph.dot | 56 ---- examples/BuddyLeNet/lenet.json | 1 - examples/BuddyLeNet/subgraph1.mlir | 516 ----------------------------- thirdparty/mimalloc | 1 - 4 files changed, 574 deletions(-) delete mode 100644 examples/BuddyLeNet/graph.dot delete mode 100644 examples/BuddyLeNet/lenet.json delete mode 100644 examples/BuddyLeNet/subgraph1.mlir delete mode 160000 thirdparty/mimalloc diff --git a/examples/BuddyLeNet/graph.dot b/examples/BuddyLeNet/graph.dot deleted file mode 100644 index 04313d9e35..0000000000 --- a/examples/BuddyLeNet/graph.dot +++ /dev/null @@ -1,56 +0,0 @@ -// Buddy Graph -digraph { - arg0_1 -> convolution - arg1_1 -> convolution - arg2_1 -> convolution_1 - arg3_1 -> convolution_1 - arg4_1 -> permute - arg5_1 -> addmm - arg6_1 -> permute_1 - arg7_1 -> addmm_1 - arg8_1 -> permute_2 - arg9_1 -> addmm_2 - arg10_1 -> convolution - convolution -> relu - relu -> max_pool2d - max_pool2d -> convolution_1 - convolution_1 -> relu_1 - relu_1 -> max_pool2d_1 - max_pool2d_1 -> view - view -> addmm - permute -> addmm - addmm -> relu_2 - relu_2 -> addmm_1 - permute_1 -> addmm_1 - addmm_1 -> relu_3 - relu_3 -> addmm_2 - permute_2 -> addmm_2 - addmm_2 -> output - arg0_1 [fillcolor=white shape=ellipse style=filled] - arg1_1 [fillcolor=white shape=ellipse style=filled] - arg2_1 [fillcolor=white shape=ellipse style=filled] - arg3_1 [fillcolor=white shape=ellipse style=filled] - arg4_1 [fillcolor=white shape=ellipse style=filled] - arg5_1 [fillcolor=white shape=ellipse style=filled] - arg6_1 [fillcolor=white shape=ellipse style=filled] - arg7_1 [fillcolor=white shape=ellipse style=filled] - arg8_1 [fillcolor=white shape=ellipse style=filled] - arg9_1 [fillcolor=white shape=ellipse style=filled] - arg10_1 [fillcolor=white shape=ellipse style=filled] - convolution [fillcolor=deepskyblue shape=box style=filled] - relu [fillcolor=deepskyblue shape=box style=filled] - max_pool2d [fillcolor=red shape=box style=filled] - convolution_1 [fillcolor=deepskyblue shape=box style=filled] - relu_1 [fillcolor=deepskyblue shape=box style=filled] - max_pool2d_1 [fillcolor=red shape=box style=filled] - view [fillcolor=deepskyblue shape=box style=filled] - permute [fillcolor=deepskyblue shape=box 
style=filled] - addmm [fillcolor=deepskyblue shape=box style=filled] - relu_2 [fillcolor=deepskyblue shape=box style=filled] - permute_1 [fillcolor=deepskyblue shape=box style=filled] - addmm_1 [fillcolor=deepskyblue shape=box style=filled] - relu_3 [fillcolor=deepskyblue shape=box style=filled] - permute_2 [fillcolor=deepskyblue shape=box style=filled] - addmm_2 [fillcolor=deepskyblue shape=box style=filled] - output [fillcolor=white shape=ellipse style=filled] -} diff --git a/examples/BuddyLeNet/lenet.json b/examples/BuddyLeNet/lenet.json deleted file mode 100644 index aa0ceb90af..0000000000 --- a/examples/BuddyLeNet/lenet.json +++ /dev/null @@ -1 +0,0 @@ -{"graph_name": "forward", "nodes": [{"name": "arg0_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6, 1, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg1_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [6], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg2_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16, 6, 5, 5], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg3_1", "children": ["convolution_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [16], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg4_1", "children": ["permute"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 256], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg5_1", "children": ["addmm"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg6_1", "children": ["permute_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 120], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg7_1", "children": ["addmm_1"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg8_1", "children": ["permute_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10, 84], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg9_1", "children": ["addmm_2"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [10], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "arg10_1", "children": ["convolution"], "parents": [], "arguments": [], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 1, 28, 28], "dtype": "Float32"}, "type": "PlaceholderType", "class": "PlaceholderOp"}, {"name": "convolution", "children": ["relu"], "parents": ["arg10_1", "arg0_1", "arg1_1"], "arguments": ["arg10_1", "arg0_1", "arg1_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu", "children": ["max_pool2d"], "parents": ["convolution"], "arguments": ["convolution"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 24, 24], "dtype": "Float32"}, 
"type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d", "children": ["convolution_1"], "parents": ["relu"], "arguments": ["relu", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 6, 12, 12], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "convolution_1", "children": ["relu_1"], "parents": ["max_pool2d", "arg2_1", "arg3_1"], "arguments": ["max_pool2d", "arg2_1", "arg3_1", [1, 1], [0, 0], [1, 1], false, [0, 0], 1], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ReduceType", "class": "Conv2dOp"}, {"name": "relu_1", "children": ["max_pool2d_1"], "parents": ["convolution_1"], "arguments": ["convolution_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 8, 8], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "max_pool2d_1", "children": ["view"], "parents": ["relu_1"], "arguments": ["relu_1", [2, 2], [2, 2]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 16, 4, 4], "dtype": "Float32"}, "type": "ReduceType", "class": "MaxPool2dOp"}, {"name": "view", "children": ["addmm"], "parents": ["max_pool2d_1"], "arguments": ["max_pool2d_1", [-1, 256]], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 256], "dtype": "Float32"}, "type": "ReshapeType", "class": "ViewOp"}, {"name": "permute", "children": ["addmm"], "parents": ["arg4_1"], "arguments": ["arg4_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [256, 120], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm", "children": ["relu_2"], "parents": ["arg5_1", "view", "permute"], "arguments": ["arg5_1", "view", "permute"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_2", "children": ["addmm_1"], "parents": ["addmm"], "arguments": ["addmm"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 120], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_1", "children": ["addmm_1"], "parents": ["arg6_1"], "arguments": ["arg6_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [120, 84], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_1", "children": ["relu_3"], "parents": ["arg7_1", "relu_2", "permute_1"], "arguments": ["arg7_1", "relu_2", "permute_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "relu_3", "children": ["addmm_2"], "parents": ["addmm_1"], "arguments": ["addmm_1"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 84], "dtype": "Float32"}, "type": "ElementwiseType", "class": "ReluOp"}, {"name": "permute_2", "children": ["addmm_2"], "parents": ["arg8_1"], "arguments": ["arg8_1", [1, 0]], "keyword_arguments": {}, "tensor_meta": {"shape": [84, 10], "dtype": "Float32"}, "type": "ReshapeType", "class": "PermuteOp"}, {"name": "addmm_2", "children": ["output"], "parents": ["arg9_1", "relu_3", "permute_2"], "arguments": ["arg9_1", "relu_3", "permute_2"], "keyword_arguments": {}, "tensor_meta": {"shape": [1, 10], "dtype": "Float32"}, "type": "ReduceType", "class": "AddMMOp"}, {"name": "output", "children": [], "parents": [], "arguments": ["addmm_2"], "keyword_arguments": {}, "tensor_meta": {}, "type": "GetItemType", "class": "OutputOp"}], "device": "cpu", "params": [{"shape": [6, 1, 5, 5], "dtype": "Float32"}, {"shape": [6], "dtype": "Float32"}, {"shape": [16, 6, 5, 5], "dtype": 
"Float32"}, {"shape": [16], "dtype": "Float32"}, {"shape": [120, 256], "dtype": "Float32"}, {"shape": [120], "dtype": "Float32"}, {"shape": [84, 120], "dtype": "Float32"}, {"shape": [84], "dtype": "Float32"}, {"shape": [10, 84], "dtype": "Float32"}, {"shape": [10], "dtype": "Float32"}], "inputs": [{"shape": [1, 1, 28, 28], "dtype": "Float32"}], "node_map_device": {"convolution": "gpu", "relu": "gpu", "max_pool2d": "gpu", "convolution_1": "gpu", "relu_1": "gpu", "max_pool2d_1": "gpu", "view": "gpu", "addmm": "gpu", "relu_2": "gpu", "addmm_1": "gpu", "relu_3": "gpu", "addmm_2": "gpu", "permute": "cpu", "permute_1": "cpu", "permute_2": "cpu"}} \ No newline at end of file diff --git a/examples/BuddyLeNet/subgraph1.mlir b/examples/BuddyLeNet/subgraph1.mlir deleted file mode 100644 index 918a5569cf..0000000000 --- a/examples/BuddyLeNet/subgraph1.mlir +++ /dev/null @@ -1,516 +0,0 @@ -module attributes {gpu.container_module} { - func.func @subgraph1(%arg0: tensor<1x1x28x28xf32>, %arg1: tensor<6x1x5x5xf32>, %arg2: tensor<6xf32>, %arg3: tensor<16x6x5x5xf32>, %arg4: tensor<16xf32>, %arg5: tensor<120xf32>, %arg6: tensor<256x120xf32>, %arg7: tensor<84xf32>, %arg8: tensor<120x84xf32>, %arg9: tensor<10xf32>, %arg10: tensor<84x10xf32>) -> tensor<1x10xf32> { - %0 = bufferization.to_memref %arg0 : memref<1x1x28x28xf32> - %1 = bufferization.to_memref %arg1 : memref<6x1x5x5xf32> - %2 = bufferization.to_memref %arg2 : memref<6xf32> - %3 = bufferization.to_memref %arg3 : memref<16x6x5x5xf32> - %4 = bufferization.to_memref %arg4 : memref<16xf32> - %5 = bufferization.to_memref %arg5 : memref<120xf32> - %6 = bufferization.to_memref %arg6 : memref<256x120xf32> - %7 = bufferization.to_memref %arg7 : memref<84xf32> - %8 = bufferization.to_memref %arg8 : memref<120x84xf32> - %9 = bufferization.to_memref %arg9 : memref<10xf32> - %10 = bufferization.to_memref %arg10 : memref<84x10xf32> - %alloc = memref.alloc() : memref<1x6x24x24xf32> - %cast = memref.cast %0 : memref<1x1x28x28xf32> to memref<*xf32> - %cast_0 = memref.cast %1 : memref<6x1x5x5xf32> to memref<*xf32> - %cast_1 = memref.cast %2 : memref<6xf32> to memref<*xf32> - %cast_2 = memref.cast %alloc : memref<1x6x24x24xf32> to memref<*xf32> - gpu.host_register %cast : memref<*xf32> - gpu.host_register %cast_0 : memref<*xf32> - gpu.host_register %cast_1 : memref<*xf32> - gpu.host_register %cast_2 : memref<*xf32> - %c1 = arith.constant 1 : index - %c1_3 = arith.constant 1 : index - %c6 = arith.constant 6 : index - %c4 = arith.constant 4 : index - %c16 = arith.constant 16 : index - %c5 = arith.constant 5 : index - %c5_4 = arith.constant 5 : index - %c0 = arith.constant 0 : index - %c1_5 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1, %arg18 = %c6, %arg19 = %c4) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16, %arg21 = %c16, %arg22 = %c1_5) { - %c2 = arith.constant 2 : index - %12 = arith.divui %arg13, %c2 : index - %13 = arith.muli %12, %c16 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c2 : index - %16 = arith.muli %15, %c16 : index - %17 = arith.addi %16, %arg15 : index - %c1_140 = arith.constant 1 : index - %c1_141 = arith.constant 1 : index - %18 = arith.muli %14, %c1_140 : index - %19 = arith.muli %17, %c1_141 : index - %c24 = arith.constant 24 : index - %c24_142 = arith.constant 24 : index - %20 = arith.cmpi ult, %14, %c24 : index - %21 = arith.cmpi ult, %17, %c24_142 : index - %22 = arith.andi %20, %21 : i1 - %cst = arith.constant 0.000000e+00 : f32 - scf.if %22 { - %23 = scf.for %arg23 = 
%c0 to %c1_3 step %c1_5 iter_args(%arg24 = %cst) -> (f32) { - %26 = scf.for %arg25 = %c0 to %c5 step %c1_5 iter_args(%arg26 = %cst) -> (f32) { - %28 = scf.for %arg27 = %c0 to %c5_4 step %c1_5 iter_args(%arg28 = %cst) -> (f32) { - %30 = arith.addi %18, %arg25 : index - %31 = arith.addi %19, %arg27 : index - %32 = memref.load %0[%arg11, %arg23, %30, %31] : memref<1x1x28x28xf32> - %33 = memref.load %1[%arg12, %arg23, %arg25, %arg27] : memref<6x1x5x5xf32> - %34 = arith.mulf %32, %33 : f32 - %35 = arith.addf %arg28, %34 : f32 - scf.yield %35 : f32 - } - %29 = arith.addf %28, %arg26 : f32 - scf.yield %29 : f32 - } - %27 = arith.addf %26, %arg24 : f32 - scf.yield %27 : f32 - } - %24 = memref.load %2[%arg12] : memref<6xf32> - %25 = arith.addf %23, %24 : f32 - memref.store %25, %alloc[%arg11, %arg12, %14, %17] : memref<1x6x24x24xf32> - } - gpu.terminator - } - gpu.host_unregister %cast : memref<*xf32> - gpu.host_unregister %cast_0 : memref<*xf32> - gpu.host_unregister %cast_1 : memref<*xf32> - gpu.host_unregister %cast_2 : memref<*xf32> - %c0_6 = arith.constant 0 : index - %c1_7 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c3456 = arith.constant 3456 : index - %alloc_8 = memref.alloc() : memref<1xindex> - memref.store %c3456, %alloc_8[%c0_6] : memref<1xindex> - %reshape = memref.reshape %alloc(%alloc_8) : (memref<1x6x24x24xf32>, memref<1xindex>) -> memref<3456xf32> - %cast_9 = memref.cast %alloc : memref<1x6x24x24xf32> to memref<*xf32> - gpu.host_register %cast_9 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_7, %arg18 = %c1_7, %arg19 = %c1_7) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512, %arg21 = %c1_7, %arg22 = %c1_7) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c3456 step %c512 { - %12 = memref.load %reshape[%arg23] : memref<3456xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape[%arg23] : memref<3456xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_9 : memref<*xf32> - %alloc_10 = memref.alloc() : memref<1x6x24x24xf32> - memref.copy %alloc, %alloc_10 : memref<1x6x24x24xf32> to memref<1x6x24x24xf32> - %alloc_11 = memref.alloc() : memref<1x6x12x12xf32> - %cast_12 = memref.cast %alloc_10 : memref<1x6x24x24xf32> to memref<*xf32> - %cast_13 = memref.cast %alloc_11 : memref<1x6x12x12xf32> to memref<*xf32> - gpu.host_register %cast_12 : memref<*xf32> - gpu.host_register %cast_13 : memref<*xf32> - %c1_14 = arith.constant 1 : index - %c6_15 = arith.constant 6 : index - %c1_16 = arith.constant 1 : index - %c16_17 = arith.constant 16 : index - %c0_18 = arith.constant 0 : index - %c1_19 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_14, %arg18 = %c6_15, %arg19 = %c1_16) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16_17, %arg21 = %c16_17, %arg22 = %c1_19) { - %c1_140 = arith.constant 1 : index - %12 = arith.divui %arg13, %c1_140 : index - %13 = arith.muli %12, %c16_17 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c1_140 : index - %16 = arith.muli %15, %c16_17 : index - %17 = arith.addi %16, %arg15 : index - %c2 = arith.constant 2 : index - %c2_141 = arith.constant 2 : index - %c2_142 = arith.constant 2 : index - %c2_143 = arith.constant 2 : index - %18 = arith.muli %14, %c2_142 : index - %19 = arith.muli %17, %c2_143 : index - %c12 = arith.constant 12 : index - %c12_144 = arith.constant 12 : index - %20 = arith.cmpi ult, %14, %c12 : index - %21 = arith.cmpi ult, %17, %c12_144 : index - %22 = arith.andi %20, %21 : i1 - 
scf.if %22 { - %23 = memref.load %alloc_10[%arg11, %arg12, %18, %19] : memref<1x6x24x24xf32> - %24 = scf.for %arg23 = %c0_18 to %c2 step %c1_19 iter_args(%arg24 = %23) -> (f32) { - %25 = scf.for %arg25 = %c0_18 to %c2_141 step %c1_19 iter_args(%arg26 = %23) -> (f32) { - %27 = arith.addi %18, %arg23 : index - %28 = arith.addi %19, %arg25 : index - %29 = memref.load %alloc_10[%arg11, %arg12, %27, %28] : memref<1x6x24x24xf32> - %30 = arith.maxnumf %arg26, %29 : f32 - scf.yield %30 : f32 - } - %26 = arith.maxnumf %25, %arg24 : f32 - scf.yield %26 : f32 - } - memref.store %24, %alloc_11[%arg11, %arg12, %14, %17] : memref<1x6x12x12xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_12 : memref<*xf32> - gpu.host_unregister %cast_13 : memref<*xf32> - %alloc_20 = memref.alloc() : memref<1x16x8x8xf32> - %cast_21 = memref.cast %alloc_11 : memref<1x6x12x12xf32> to memref<*xf32> - %cast_22 = memref.cast %3 : memref<16x6x5x5xf32> to memref<*xf32> - %cast_23 = memref.cast %4 : memref<16xf32> to memref<*xf32> - %cast_24 = memref.cast %alloc_20 : memref<1x16x8x8xf32> to memref<*xf32> - gpu.host_register %cast_21 : memref<*xf32> - gpu.host_register %cast_22 : memref<*xf32> - gpu.host_register %cast_23 : memref<*xf32> - gpu.host_register %cast_24 : memref<*xf32> - %c1_25 = arith.constant 1 : index - %c6_26 = arith.constant 6 : index - %c16_27 = arith.constant 16 : index - %c1_28 = arith.constant 1 : index - %c16_29 = arith.constant 16 : index - %c5_30 = arith.constant 5 : index - %c5_31 = arith.constant 5 : index - %c0_32 = arith.constant 0 : index - %c1_33 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_25, %arg18 = %c16_27, %arg19 = %c1_28) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16_29, %arg21 = %c16_29, %arg22 = %c1_33) { - %c1_140 = arith.constant 1 : index - %12 = arith.divui %arg13, %c1_140 : index - %13 = arith.muli %12, %c16_29 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c1_140 : index - %16 = arith.muli %15, %c16_29 : index - %17 = arith.addi %16, %arg15 : index - %c1_141 = arith.constant 1 : index - %c1_142 = arith.constant 1 : index - %18 = arith.muli %14, %c1_141 : index - %19 = arith.muli %17, %c1_142 : index - %c8 = arith.constant 8 : index - %c8_143 = arith.constant 8 : index - %20 = arith.cmpi ult, %14, %c8 : index - %21 = arith.cmpi ult, %17, %c8_143 : index - %22 = arith.andi %20, %21 : i1 - %cst = arith.constant 0.000000e+00 : f32 - scf.if %22 { - %23 = scf.for %arg23 = %c0_32 to %c6_26 step %c1_33 iter_args(%arg24 = %cst) -> (f32) { - %26 = scf.for %arg25 = %c0_32 to %c5_30 step %c1_33 iter_args(%arg26 = %cst) -> (f32) { - %28 = scf.for %arg27 = %c0_32 to %c5_31 step %c1_33 iter_args(%arg28 = %cst) -> (f32) { - %30 = arith.addi %18, %arg25 : index - %31 = arith.addi %19, %arg27 : index - %32 = memref.load %alloc_11[%arg11, %arg23, %30, %31] : memref<1x6x12x12xf32> - %33 = memref.load %3[%arg12, %arg23, %arg25, %arg27] : memref<16x6x5x5xf32> - %34 = arith.mulf %32, %33 : f32 - %35 = arith.addf %arg28, %34 : f32 - scf.yield %35 : f32 - } - %29 = arith.addf %28, %arg26 : f32 - scf.yield %29 : f32 - } - %27 = arith.addf %26, %arg24 : f32 - scf.yield %27 : f32 - } - %24 = memref.load %4[%arg12] : memref<16xf32> - %25 = arith.addf %23, %24 : f32 - memref.store %25, %alloc_20[%arg11, %arg12, %14, %17] : memref<1x16x8x8xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_21 : memref<*xf32> - gpu.host_unregister %cast_22 : memref<*xf32> - gpu.host_unregister %cast_23 : memref<*xf32> - gpu.host_unregister 
%cast_24 : memref<*xf32> - %c0_34 = arith.constant 0 : index - %c1_35 = arith.constant 1 : index - %c512_36 = arith.constant 512 : index - %c1024 = arith.constant 1024 : index - %alloc_37 = memref.alloc() : memref<1xindex> - memref.store %c1024, %alloc_37[%c0_34] : memref<1xindex> - %reshape_38 = memref.reshape %alloc_20(%alloc_37) : (memref<1x16x8x8xf32>, memref<1xindex>) -> memref<1024xf32> - %cast_39 = memref.cast %alloc_20 : memref<1x16x8x8xf32> to memref<*xf32> - gpu.host_register %cast_39 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_35, %arg18 = %c1_35, %arg19 = %c1_35) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_36, %arg21 = %c1_35, %arg22 = %c1_35) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c1024 step %c512_36 { - %12 = memref.load %reshape_38[%arg23] : memref<1024xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape_38[%arg23] : memref<1024xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_39 : memref<*xf32> - %alloc_40 = memref.alloc() : memref<1x16x8x8xf32> - memref.copy %alloc_20, %alloc_40 : memref<1x16x8x8xf32> to memref<1x16x8x8xf32> - %alloc_41 = memref.alloc() : memref<1x16x4x4xf32> - %cast_42 = memref.cast %alloc_40 : memref<1x16x8x8xf32> to memref<*xf32> - %cast_43 = memref.cast %alloc_41 : memref<1x16x4x4xf32> to memref<*xf32> - gpu.host_register %cast_42 : memref<*xf32> - gpu.host_register %cast_43 : memref<*xf32> - %c1_44 = arith.constant 1 : index - %c16_45 = arith.constant 16 : index - %c1_46 = arith.constant 1 : index - %c16_47 = arith.constant 16 : index - %c0_48 = arith.constant 0 : index - %c1_49 = arith.constant 1 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_44, %arg18 = %c16_45, %arg19 = %c1_46) threads(%arg14, %arg15, %arg16) in (%arg20 = %c16_47, %arg21 = %c16_47, %arg22 = %c1_49) { - %c1_140 = arith.constant 1 : index - %12 = arith.divui %arg13, %c1_140 : index - %13 = arith.muli %12, %c16_47 : index - %14 = arith.addi %13, %arg14 : index - %15 = arith.remui %arg13, %c1_140 : index - %16 = arith.muli %15, %c16_47 : index - %17 = arith.addi %16, %arg15 : index - %c2 = arith.constant 2 : index - %c2_141 = arith.constant 2 : index - %c2_142 = arith.constant 2 : index - %c2_143 = arith.constant 2 : index - %18 = arith.muli %14, %c2_142 : index - %19 = arith.muli %17, %c2_143 : index - %c4_144 = arith.constant 4 : index - %c4_145 = arith.constant 4 : index - %20 = arith.cmpi ult, %14, %c4_144 : index - %21 = arith.cmpi ult, %17, %c4_145 : index - %22 = arith.andi %20, %21 : i1 - scf.if %22 { - %23 = memref.load %alloc_40[%arg11, %arg12, %18, %19] : memref<1x16x8x8xf32> - %24 = scf.for %arg23 = %c0_48 to %c2 step %c1_49 iter_args(%arg24 = %23) -> (f32) { - %25 = scf.for %arg25 = %c0_48 to %c2_141 step %c1_49 iter_args(%arg26 = %23) -> (f32) { - %27 = arith.addi %18, %arg23 : index - %28 = arith.addi %19, %arg25 : index - %29 = memref.load %alloc_40[%arg11, %arg12, %27, %28] : memref<1x16x8x8xf32> - %30 = arith.maxnumf %arg26, %29 : f32 - scf.yield %30 : f32 - } - %26 = arith.maxnumf %25, %arg24 : f32 - scf.yield %26 : f32 - } - memref.store %24, %alloc_41[%arg11, %arg12, %14, %17] : memref<1x16x4x4xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_42 : memref<*xf32> - gpu.host_unregister %cast_43 : memref<*xf32> - %alloc_50 = memref.alloc() : memref<2xindex> - %c0_51 = arith.constant 0 : index - %c1_52 = arith.constant 1 : index - memref.store %c1_52, %alloc_50[%c0_51] : memref<2xindex> - %c1_53 = arith.constant 1 : index - %c256 = 
arith.constant 256 : index - memref.store %c256, %alloc_50[%c1_53] : memref<2xindex> - %reshape_54 = memref.reshape %alloc_41(%alloc_50) : (memref<1x16x4x4xf32>, memref<2xindex>) -> memref<1x256xf32> - %c0_55 = arith.constant 0 : index - %c1_56 = arith.constant 1 : index - %c512_57 = arith.constant 512 : index - %c256_58 = arith.constant 256 : index - %c30720 = arith.constant 30720 : index - %c120 = arith.constant 120 : index - %alloc_59 = memref.alloc() : memref<1xindex> - %alloc_60 = memref.alloc() : memref<1xindex> - %alloc_61 = memref.alloc() : memref<1xindex> - memref.store %c256_58, %alloc_59[%c0_55] : memref<1xindex> - memref.store %c30720, %alloc_60[%c0_55] : memref<1xindex> - memref.store %c120, %alloc_61[%c0_55] : memref<1xindex> - %reshape_62 = memref.reshape %reshape_54(%alloc_59) : (memref<1x256xf32>, memref<1xindex>) -> memref<256xf32> - %reshape_63 = memref.reshape %6(%alloc_60) : (memref<256x120xf32>, memref<1xindex>) -> memref<30720xf32> - %reshape_64 = memref.reshape %5(%alloc_61) : (memref<120xf32>, memref<1xindex>) -> memref<120xf32> - %cast_65 = memref.cast %reshape_62 : memref<256xf32> to memref<*xf32> - gpu.host_register %cast_65 : memref<*xf32> - %cast_66 = memref.cast %reshape_63 : memref<30720xf32> to memref<*xf32> - gpu.host_register %cast_66 : memref<*xf32> - %cast_67 = memref.cast %reshape_64 : memref<120xf32> to memref<*xf32> - gpu.host_register %cast_67 : memref<*xf32> - %c1_68 = arith.constant 1 : index - %c120_69 = arith.constant 120 : index - %c256_70 = arith.constant 256 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_56, %arg18 = %c1_56, %arg19 = %c1_56) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_57, %arg21 = %c1_56, %arg22 = %c1_56) { - scf.for %arg23 = %arg14 to %c1_68 step %c512_57 { - scf.for %arg24 = %arg15 to %c120_69 step %c1_56 { - %cst = arith.constant 0.000000e+00 : f32 - %12 = scf.for %arg25 = %c0_55 to %c256_70 step %c1_56 iter_args(%arg26 = %cst) -> (f32) { - %19 = arith.muli %arg23, %c256_70 : index - %20 = arith.addi %19, %arg25 : index - %21 = memref.load %reshape_62[%20] : memref<256xf32> - %22 = arith.muli %arg25, %c120_69 : index - %23 = arith.addi %22, %arg24 : index - %24 = memref.load %reshape_63[%23] : memref<30720xf32> - %25 = arith.mulf %21, %24 : f32 - %26 = arith.addf %arg26, %25 : f32 - scf.yield %26 : f32 - } - %13 = arith.muli %arg23, %c120_69 : index - %14 = arith.addi %13, %arg24 : index - %15 = memref.load %reshape_64[%14] : memref<120xf32> - %16 = arith.addf %12, %15 : f32 - %17 = arith.muli %arg23, %c120_69 : index - %18 = arith.addi %17, %arg24 : index - memref.store %16, %reshape_64[%18] : memref<120xf32> - } - } - gpu.terminator - } - %alloc_71 = memref.alloc() : memref<1x120xf32> - %alloc_72 = memref.alloc() : memref<2xindex> - %c1_73 = arith.constant 1 : index - %c0_74 = arith.constant 0 : index - memref.store %c1_73, %alloc_72[%c0_74] : memref<2xindex> - %c120_75 = arith.constant 120 : index - %c1_76 = arith.constant 1 : index - memref.store %c120_75, %alloc_72[%c1_76] : memref<2xindex> - %reshape_77 = memref.reshape %5(%alloc_72) : (memref<120xf32>, memref<2xindex>) -> memref<1x120xf32> - memref.copy %reshape_77, %alloc_71 : memref<1x120xf32> to memref<1x120xf32> - %c0_78 = arith.constant 0 : index - %c1_79 = arith.constant 1 : index - %c512_80 = arith.constant 512 : index - %c120_81 = arith.constant 120 : index - %alloc_82 = memref.alloc() : memref<1xindex> - memref.store %c120_81, %alloc_82[%c0_78] : memref<1xindex> - %reshape_83 = memref.reshape %alloc_71(%alloc_82) : 
(memref<1x120xf32>, memref<1xindex>) -> memref<120xf32> - %cast_84 = memref.cast %alloc_71 : memref<1x120xf32> to memref<*xf32> - gpu.host_register %cast_84 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_79, %arg18 = %c1_79, %arg19 = %c1_79) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_80, %arg21 = %c1_79, %arg22 = %c1_79) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c120_81 step %c512_80 { - %12 = memref.load %reshape_83[%arg23] : memref<120xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape_83[%arg23] : memref<120xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_84 : memref<*xf32> - %alloc_85 = memref.alloc() : memref<1x120xf32> - memref.copy %alloc_71, %alloc_85 : memref<1x120xf32> to memref<1x120xf32> - %c0_86 = arith.constant 0 : index - %c1_87 = arith.constant 1 : index - %c512_88 = arith.constant 512 : index - %c120_89 = arith.constant 120 : index - %c10080 = arith.constant 10080 : index - %c84 = arith.constant 84 : index - %alloc_90 = memref.alloc() : memref<1xindex> - %alloc_91 = memref.alloc() : memref<1xindex> - %alloc_92 = memref.alloc() : memref<1xindex> - memref.store %c120_89, %alloc_90[%c0_86] : memref<1xindex> - memref.store %c10080, %alloc_91[%c0_86] : memref<1xindex> - memref.store %c84, %alloc_92[%c0_86] : memref<1xindex> - %reshape_93 = memref.reshape %alloc_85(%alloc_90) : (memref<1x120xf32>, memref<1xindex>) -> memref<120xf32> - %reshape_94 = memref.reshape %8(%alloc_91) : (memref<120x84xf32>, memref<1xindex>) -> memref<10080xf32> - %reshape_95 = memref.reshape %7(%alloc_92) : (memref<84xf32>, memref<1xindex>) -> memref<84xf32> - %cast_96 = memref.cast %reshape_93 : memref<120xf32> to memref<*xf32> - gpu.host_register %cast_96 : memref<*xf32> - %cast_97 = memref.cast %reshape_94 : memref<10080xf32> to memref<*xf32> - gpu.host_register %cast_97 : memref<*xf32> - %cast_98 = memref.cast %reshape_95 : memref<84xf32> to memref<*xf32> - gpu.host_register %cast_98 : memref<*xf32> - %c1_99 = arith.constant 1 : index - %c84_100 = arith.constant 84 : index - %c120_101 = arith.constant 120 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_87, %arg18 = %c1_87, %arg19 = %c1_87) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_88, %arg21 = %c1_87, %arg22 = %c1_87) { - scf.for %arg23 = %arg14 to %c1_99 step %c512_88 { - scf.for %arg24 = %arg15 to %c84_100 step %c1_87 { - %cst = arith.constant 0.000000e+00 : f32 - %12 = scf.for %arg25 = %c0_86 to %c120_101 step %c1_87 iter_args(%arg26 = %cst) -> (f32) { - %19 = arith.muli %arg23, %c120_101 : index - %20 = arith.addi %19, %arg25 : index - %21 = memref.load %reshape_93[%20] : memref<120xf32> - %22 = arith.muli %arg25, %c84_100 : index - %23 = arith.addi %22, %arg24 : index - %24 = memref.load %reshape_94[%23] : memref<10080xf32> - %25 = arith.mulf %21, %24 : f32 - %26 = arith.addf %arg26, %25 : f32 - scf.yield %26 : f32 - } - %13 = arith.muli %arg23, %c84_100 : index - %14 = arith.addi %13, %arg24 : index - %15 = memref.load %reshape_95[%14] : memref<84xf32> - %16 = arith.addf %12, %15 : f32 - %17 = arith.muli %arg23, %c84_100 : index - %18 = arith.addi %17, %arg24 : index - memref.store %16, %reshape_95[%18] : memref<84xf32> - } - } - gpu.terminator - } - %alloc_102 = memref.alloc() : memref<1x84xf32> - %alloc_103 = memref.alloc() : memref<2xindex> - %c1_104 = arith.constant 1 : index - %c0_105 = arith.constant 0 : index - memref.store %c1_104, %alloc_103[%c0_105] : memref<2xindex> - %c84_106 = arith.constant 84 : 
index - %c1_107 = arith.constant 1 : index - memref.store %c84_106, %alloc_103[%c1_107] : memref<2xindex> - %reshape_108 = memref.reshape %7(%alloc_103) : (memref<84xf32>, memref<2xindex>) -> memref<1x84xf32> - memref.copy %reshape_108, %alloc_102 : memref<1x84xf32> to memref<1x84xf32> - %c0_109 = arith.constant 0 : index - %c1_110 = arith.constant 1 : index - %c512_111 = arith.constant 512 : index - %c84_112 = arith.constant 84 : index - %alloc_113 = memref.alloc() : memref<1xindex> - memref.store %c84_112, %alloc_113[%c0_109] : memref<1xindex> - %reshape_114 = memref.reshape %alloc_102(%alloc_113) : (memref<1x84xf32>, memref<1xindex>) -> memref<84xf32> - %cast_115 = memref.cast %alloc_102 : memref<1x84xf32> to memref<*xf32> - gpu.host_register %cast_115 : memref<*xf32> - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_110, %arg18 = %c1_110, %arg19 = %c1_110) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_111, %arg21 = %c1_110, %arg22 = %c1_110) { - %cst = arith.constant 0.000000e+00 : f32 - scf.for %arg23 = %arg14 to %c84_112 step %c512_111 { - %12 = memref.load %reshape_114[%arg23] : memref<84xf32> - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %reshape_114[%arg23] : memref<84xf32> - } - gpu.terminator - } - gpu.host_unregister %cast_115 : memref<*xf32> - %alloc_116 = memref.alloc() : memref<1x84xf32> - memref.copy %alloc_102, %alloc_116 : memref<1x84xf32> to memref<1x84xf32> - %c0_117 = arith.constant 0 : index - %c1_118 = arith.constant 1 : index - %c512_119 = arith.constant 512 : index - %c84_120 = arith.constant 84 : index - %c840 = arith.constant 840 : index - %c10 = arith.constant 10 : index - %alloc_121 = memref.alloc() : memref<1xindex> - %alloc_122 = memref.alloc() : memref<1xindex> - %alloc_123 = memref.alloc() : memref<1xindex> - memref.store %c84_120, %alloc_121[%c0_117] : memref<1xindex> - memref.store %c840, %alloc_122[%c0_117] : memref<1xindex> - memref.store %c10, %alloc_123[%c0_117] : memref<1xindex> - %reshape_124 = memref.reshape %alloc_116(%alloc_121) : (memref<1x84xf32>, memref<1xindex>) -> memref<84xf32> - %reshape_125 = memref.reshape %10(%alloc_122) : (memref<84x10xf32>, memref<1xindex>) -> memref<840xf32> - %reshape_126 = memref.reshape %9(%alloc_123) : (memref<10xf32>, memref<1xindex>) -> memref<10xf32> - %cast_127 = memref.cast %reshape_124 : memref<84xf32> to memref<*xf32> - gpu.host_register %cast_127 : memref<*xf32> - %cast_128 = memref.cast %reshape_125 : memref<840xf32> to memref<*xf32> - gpu.host_register %cast_128 : memref<*xf32> - %cast_129 = memref.cast %reshape_126 : memref<10xf32> to memref<*xf32> - gpu.host_register %cast_129 : memref<*xf32> - %c1_130 = arith.constant 1 : index - %c10_131 = arith.constant 10 : index - %c84_132 = arith.constant 84 : index - gpu.launch blocks(%arg11, %arg12, %arg13) in (%arg17 = %c1_118, %arg18 = %c1_118, %arg19 = %c1_118) threads(%arg14, %arg15, %arg16) in (%arg20 = %c512_119, %arg21 = %c1_118, %arg22 = %c1_118) { - scf.for %arg23 = %arg14 to %c1_130 step %c512_119 { - scf.for %arg24 = %arg15 to %c10_131 step %c1_118 { - %cst = arith.constant 0.000000e+00 : f32 - %12 = scf.for %arg25 = %c0_117 to %c84_132 step %c1_118 iter_args(%arg26 = %cst) -> (f32) { - %19 = arith.muli %arg23, %c84_132 : index - %20 = arith.addi %19, %arg25 : index - %21 = memref.load %reshape_124[%20] : memref<84xf32> - %22 = arith.muli %arg25, %c10_131 : index - %23 = arith.addi %22, %arg24 : index - %24 = memref.load %reshape_125[%23] : memref<840xf32> - %25 = arith.mulf %21, %24 : f32 - %26 = arith.addf %arg26, %25 : 
f32 - scf.yield %26 : f32 - } - %13 = arith.muli %arg23, %c10_131 : index - %14 = arith.addi %13, %arg24 : index - %15 = memref.load %reshape_126[%14] : memref<10xf32> - %16 = arith.addf %12, %15 : f32 - %17 = arith.muli %arg23, %c10_131 : index - %18 = arith.addi %17, %arg24 : index - memref.store %16, %reshape_126[%18] : memref<10xf32> - } - } - gpu.terminator - } - %alloc_133 = memref.alloc() : memref<1x10xf32> - %alloc_134 = memref.alloc() : memref<2xindex> - %c1_135 = arith.constant 1 : index - %c0_136 = arith.constant 0 : index - memref.store %c1_135, %alloc_134[%c0_136] : memref<2xindex> - %c10_137 = arith.constant 10 : index - %c1_138 = arith.constant 1 : index - memref.store %c10_137, %alloc_134[%c1_138] : memref<2xindex> - %reshape_139 = memref.reshape %9(%alloc_134) : (memref<10xf32>, memref<2xindex>) -> memref<1x10xf32> - memref.copy %reshape_139, %alloc_133 : memref<1x10xf32> to memref<1x10xf32> - %11 = bufferization.to_tensor %alloc_133 : memref<1x10xf32> - return %11 : tensor<1x10xf32> - } -} - diff --git a/thirdparty/mimalloc b/thirdparty/mimalloc deleted file mode 160000 index 81a771161e..0000000000 --- a/thirdparty/mimalloc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 81a771161e37c8559c442fff099115cd1977db1e From 75445899dcf80fb394205c2ae65684924a116e47 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Sun, 29 Sep 2024 13:46:05 +0000 Subject: [PATCH 21/29] standby --- examples/BuddyLeNet/buddy-lenet-import.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 95e76de253..cae6924b18 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -63,6 +63,8 @@ path_prefix = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) +with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: + print(driver.subgraphs[0]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file) From 82b92f8507a7c22ed4921f790da2b1bfdfeb13db Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 15 Oct 2024 07:59:07 +0000 Subject: [PATCH 22/29] The GPU OP-enabled version --- examples/BuddyLeNet/CMakeLists.txt | 78 +++++- examples/BuddyLeNet/buddy-lenet-import.py | 14 +- examples/BuddyLeNet/makefile | 17 ++ examples/BuddyLeNet/matmul-cubin.mlir | 3 + examples/BuddyLeNet/transform.mlir | 311 ++++++++++++++++++++++ frontend/Python/graph/graph.py | 12 +- thirdparty/mimalloc | 1 + 7 files changed, 408 insertions(+), 28 deletions(-) create mode 100644 examples/BuddyLeNet/matmul-cubin.mlir create mode 100644 examples/BuddyLeNet/transform.mlir create mode 160000 thirdparty/mimalloc diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 89c93a17d3..337c4342bd 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -1,5 +1,5 @@ add_custom_command( - OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data + OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data COMMAND python3 
${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py COMMENT "Generating forward.mlir, subgraph1.mlir and parameter files" ) @@ -17,43 +17,93 @@ add_custom_command( COMMENT "Building forward.o" VERBATIM) +# add_custom_command( +# OUTPUT subgraph0.ll +# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# --transform-preload-library -transform-library-paths="transform.mlir" +# --transform-interpreter -entry-point="codegen" | +# ${BUDDY_BINARY_DIR}/buddy-opt --pass-pipeline "builtin.module(func.func(nvgpu-optimize-shared-memory))" | +# ${BUDDY_BINARY_DIR}/buddy-opt +# -arith-expand +# -eliminate-empty-tensors +# -empty-tensor-to-alloc-tensor +# -linalg-bufferize +# -convert-linalg-to-affine-loops +# -affine-loop-fusion +# -affine-parallelize +# -lower-affine +# -canonicalize +# -func-bufferize +# -arith-bufferize +# -tensor-bufferize +# -buffer-deallocation +# -finalizing-bufferize +# -canonicalize +# -gpu-launch-sink-index-computations +# -canonicalize +# -legalize-shmem-outlining +# -canonicalize +# -convert-memcpy-to-gpu +# -gpu-async-region +# -canonicalize +# -convert-scf-to-cf +# -memref-expand +# -finalize-memref-to-llvm +# -convert-arith-to-llvm +# -convert-vector-to-llvm +# -convert-gpu-to-nvvm +# -has-redux=1 +# -llvm-request-c-wrappers +# -canonicalize +# -cse +# -sccp | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt +# --test-lower-to-nvvm -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# COMMENT "Building subgraph0.ll" +# VERBATIM +# ) + add_custom_command( OUTPUT subgraph0.ll COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir COMMENT "Building subgraph0.ll" VERBATIM) add_custom_command( OUTPUT subgraph0.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll COMMENT "Building subgraph0.o" VERBATIM) + + + # add_custom_command( # OUTPUT subgraph1.ll # COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | -# ${LLVM_MLIR_BINARY_DIR}/mlir-opt +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt # -pass-pipeline 
"builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | -# ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll # DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir # COMMENT "Building subgraph1.ll" # VERBATIM) # add_custom_command( # OUTPUT subgraph1.o -# COMMAND ${LLVM_MLIR_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o +# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o # DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll # COMMENT "Building subgraph1.o" # VERBATIM) add_custom_command( OUTPUT subgraph1.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | ${BUDDY_BINARY_DIR}/buddy-opt -eliminate-empty-tensors @@ -77,11 +127,11 @@ add_custom_command( -convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llvm-as | - ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir - COMMENT "Building subgraph0.o" + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_TOOLS_BINARY_DIR}/llvm-as | + ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMENT "Building subgraph1.o" VERBATIM) add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) @@ -93,5 +143,5 @@ target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR}) set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES}) -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${OpenCV_LIBS}) +set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES}) target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS}) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 903a8b095b..c878b3b163 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -75,21 +75,19 @@ with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) -with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: - print(driver.subgraphs[0]._imported_module, file=module_file) with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: print(driver.subgraphs[1]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: print(driver.construct_main_graph(True), file=module_file) -# params = dynamo_compiler.imported_params[graph] -# current_path = 
os.path.dirname(os.path.abspath(__file__)) +params = dynamo_compiler.imported_params[graph] +current_path = os.path.dirname(os.path.abspath(__file__)) -# float32_param = np.concatenate( -# [param.detach().numpy().reshape([-1]) for param in params] -# ) +float32_param = np.concatenate( + [param.detach().numpy().reshape([-1]) for param in params] +) -# float32_param.tofile(Path(current_path) / "arg0.data") +float32_param.tofile(Path(current_path) / "arg0.data") # # Convert the lenet graph to JSON string # json_str = graph.to_json() diff --git a/examples/BuddyLeNet/makefile b/examples/BuddyLeNet/makefile index fe87b6da1a..f29fcf0769 100644 --- a/examples/BuddyLeNet/makefile +++ b/examples/BuddyLeNet/makefile @@ -20,6 +20,22 @@ MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.dylib MTRIPLE := x86_64-apple-darwin endif +buddy-gpu-matmul-lower: + @${BUDDY_OPT} subgraph0.mlir \ + -transform-preload-library="transform-library-paths=transform.mlir" \ + -transform-interpreter="entry-point=codegen" \ + -o log.mlir + +buddy-gpu-matmul: + @${BUDDY_OPT} subgraph0.mlir -transform-preload-library="transform-library-paths=transform.mlir" -transform-interpreter="entry-point=codegen" | \ + ${BUDDY_OPT} --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | \ + ${BUDDY_OPT} -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize | \ + ${BUDDY_OPT} -gpu-launch-sink-index-computations -canonicalize -legalize-shmem-outlining -canonicalize | \ + ${BUDDY_OPT} -convert-memcpy-to-gpu -gpu-async-region -canonicalize | \ + ${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \ + ${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \ + ${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o matmul-cubin.mlir + buddy-lenet-lower: @${BUDDY_OPT} ./fake-lenet.mlir \ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \ @@ -124,3 +140,4 @@ buddy-lenet-opt-run: -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + diff --git a/examples/BuddyLeNet/matmul-cubin.mlir b/examples/BuddyLeNet/matmul-cubin.mlir new file mode 100644 index 0000000000..0a1e515093 --- /dev/null +++ b/examples/BuddyLeNet/matmul-cubin.mlir @@ -0,0 +1,3 @@ +module { +} + diff --git a/examples/BuddyLeNet/transform.mlir b/examples/BuddyLeNet/transform.mlir new file mode 100644 index 0000000000..e2a02a9a97 --- /dev/null +++ b/examples/BuddyLeNet/transform.mlir @@ -0,0 +1,311 @@ +module attributes { transform.with_named_sequence } { + transform.named_sequence @codegen(%arg0: !transform.any_op) { + // Match the target operations and assign them to SSA values. + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 + : (!transform.any_op) -> !transform.any_op + %fill = transform.structured.match ops{["linalg.fill"]} in %arg0 + : (!transform.any_op) -> !transform.any_op + + // Perform tiling for the grid. 
+ // For the matrix multiplication of 5376x2048 and 2048x5376, the compilation + // strategy sets the tile size for grid-based partitioning to 128x256. + // This means that each [128, 2048] @ [2048, 256] matmul tile is computed within a GPU block, + // while multiple such blocks are computed in parallel across the grid. + // `tile_sizes` specify the dimensions of the tiled matmul result. + // `%tiled_op` is the tiled matmul operation within the `scf.forall` loop. + // `%forall_op` is the `scf.forall` loop that maintains tile information. + %tiled_op, %forall_op = transform.structured.tile_using_forall %matmul + tile_sizes [128, 256] (mapping = [#gpu.block, #gpu.block]) + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Perform canonicalization. + %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %1 : !transform.any_op + %all_loops = transform.structured.match interface{LoopLikeInterface} + in %arg0 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops : !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.linalg.tiling_canonicalization + } : !transform.any_op + + // Fuse the fill operation into the scf.forall op. + %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Further tile the tiled matmul. + // Tile the third dimension in matmul. + // [128, 2048] @ [2048, 256] matmul is further tiled into [128, 16] @ [16, 256] matmul. + %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Create pad op and prepare for mapping to GPU. + // Nothing has changed in the operation. + %padded, %pad, %copy = transform.structured.pad %tiled_linalg_op {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + // Rewrite tensor.pad into linalg.copy. + %3 = transform.get_producer_of_operand %padded[0] : (!transform.any_op) -> !transform.any_op + %4 = transform.get_producer_of_operand %padded[1] : (!transform.any_op) -> !transform.any_op + %5 = transform.get_producer_of_operand %padded[2] : (!transform.any_op) -> !transform.any_op + %6 = transform.structured.rewrite_in_destination_passing_style %3 : (!transform.any_op) -> !transform.any_op + %7 = transform.structured.rewrite_in_destination_passing_style %4 : (!transform.any_op) -> !transform.any_op + %8 = transform.structured.rewrite_in_destination_passing_style %5 : (!transform.any_op) -> !transform.any_op + + // Tile the linalg.copy op and map it to GPU thread level, + // such that the tiled matrices are copied to GPU shared memory. + // num_threads is different from tile_sizes used above, + // as it specifies the number of tiles instead of the size of each tile. + // The first transform tiles the [128, 16] into [4, 4], + // and the second transform tiles the [16, 256] into [2, 16].
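The tile arithmetic quoted in the comments above can be checked mechanically. The following Python sketch is an illustration only (not part of the patch); the matmul shape, block tile, K step, and num_threads values are taken from the comments in this transform script, and the helper function is hypothetical.

# Sanity check of the tiling arithmetic described above (illustration only).
# Assumed values, all quoted in this script: a 5376x2048 @ 2048x5376 matmul,
# a 128x256 block tile, a K step of 16, and copy num_threads of [32, 4] / [8, 16].
def per_thread_tile(operand_shape, num_threads):
    # Each thread handles operand_shape / num_threads elements per dimension.
    return [dim // threads for dim, threads in zip(operand_shape, num_threads)]

M, K, N = 5376, 2048, 5376
block_tile = (128, 256)   # tile_sizes for the grid-level scf.forall
k_step = 16               # tile_using_for step on the reduction dimension

assert per_thread_tile([block_tile[0], k_step], [32, 4]) == [4, 4]    # A-operand copy
assert per_thread_tile([k_step, block_tile[1]], [8, 16]) == [2, 16]   # B-operand copy
print("K steps per block tile:", K // k_step)                         # 128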
+ %tiled_op_0, %forall_op_1 = transform.structured.tile_using_forall %6 num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %tiled_op_2, %forall_op_3 = transform.structured.tile_using_forall %7 num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile the linalg.matmul op and map it to GPU warp level. + %tiled_op_4, %forall_op_5 = transform.structured.tile_using_forall %padded num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // Tile the linalg.fill op and map it to GPU warp level. + %tiled_op_6, %forall_op_7 = transform.structured.tile_using_forall %fused_op num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Perform canonicalization. + %9 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %9 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %9 : !transform.any_op + %all_loops_2 = transform.structured.match interface{LoopLikeInterface} + in %9 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_2 : !transform.any_op + transform.apply_patterns to %9 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Perform vectorization. + // Vectorize the linalg.copy, linalg.fill, and linalg.matmul operations. + %10 = transform.structured.vectorize_children_and_apply_patterns %9 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %10 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %10 : !transform.any_op + %all_loops_3 = transform.structured.match interface{LoopLikeInterface} + in %10 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_3 : !transform.any_op + transform.apply_patterns to %10 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Match bufferization.alloc_tensors inside the forall op + %scf_forall = transform.structured.match ops{["scf.forall"]} attributes{mapping = [#gpu.block, #gpu.block]} in %arg0 : (!transform.any_op) -> !transform.any_op + %alloc_tensor_ops = transform.structured.match ops{["bufferization.alloc_tensor"]} in %scf_forall : (!transform.any_op) -> !transform.any_op + + // Bufferize the alloc_tensor ops to memref.alloc ops. + // The memory_space attribute for GPU Dialect 0 means global memory, 3 means workgroup memory address, 5 means private memory address. + // According to https://discourse.llvm.org/t/rfc-memref-memory-shape-as-attribute/2229 + %buffer, %new_ops = transform.structured.bufferize_to_allocation %alloc_tensor_ops {memory_space = 3 } : !transform.any_op + + // Eliminate empty tensors and erase unnecessary inputs. 
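To make the warp-level split above concrete, here is a small illustrative Python sketch (not part of the patch). It only restates the arithmetic implied by num_threads [2, 2] on the 128x256 block tile; the helper function is hypothetical.

# Illustration only: dividing the 128x256 block tile across the 2x2 warp grid
# requested by `num_threads [2, 2]` above.
def warp_tiles(block_tile, warp_grid):
    rows, cols = block_tile
    warps_m, warps_n = warp_grid
    return (rows // warps_m, cols // warps_n), warps_m * warps_n

per_warp_tile, warps_per_block = warp_tiles((128, 256), (2, 2))
print(per_warp_tile, warps_per_block)  # (64, 128) per warp, 4 warps per block
# With a warp size of 32, those 4 warps match the 64 * 2 * 1 = 128 threads that
# the later `map_nested_forall_to_threads block_dims = [64, 2, 1]` step uses.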
+ transform.structured.eliminate_empty_tensors %arg0 : !transform.any_op + %func_eras = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func_eras { + transform.apply_patterns.linalg.erase_unnecessary_inputs + } : !transform.any_op + + // Bufferize the remaining operations in one time. + %11 = transform.bufferization.one_shot_bufferize %arg0 { bufferize_function_boundaries = true, function_boundary_type_conversion = 1 : i32} : (!transform.any_op) -> !transform.any_op + + // Erase dead alloc and stores. + %12 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + transform.memref.erase_dead_alloc_and_stores %12 : (!transform.any_op) -> () + + // Generate GPU launch. + %13 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + %gpu_launch = transform.gpu.map_forall_to_blocks %13 { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op + + // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute. + %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op + + %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + + // Removes unnecessary GPU barriers from the function. + // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %15 : !transform.any_op + %all_loops_4 = transform.structured.match interface{LoopLikeInterface} + in %15 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_4 : !transform.any_op + transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Identify static memory allocations within the given region, + // and move them to a higher level (hoisting). + transform.buddy.hoist_static_alloc %15 : (!transform.any_op) -> () + + // Collects patterns for folding memref aliasing ops (memref.subview) into consumer load/store ops (affine.load, memref.load, nvgpu.ldmatrix, vector.load, vector.transfer_read, affine.store, memref.store, etc.) and other ops (e.g., memref.subview). + transform.apply_patterns to %15 { + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + // Collects patterns for extracting address computations from operations with memory accesses such that these memory accesses use only a base pointer. + transform.apply_patterns to %15 { + transform.apply_patterns.memref.extract_address_computations + } : !transform.any_op + // Perform canonicalization. 
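As a rough illustration of the launch shape implied by the block/thread mapping above (not part of the patch; it assumes the 5376x5376 result and the 128x256 block tile quoted at the top of this script):

# Back-of-the-envelope launch configuration (illustration only).
import math

result_shape = (5376, 5376)   # assumed from the matmul quoted in the comments
block_tile = (128, 256)       # grid-level tile_sizes
block_dims = (64, 2, 1)       # from map_nested_forall_to_threads above

grid = tuple(math.ceil(dim / tile) for dim, tile in zip(result_shape, block_tile))
threads_per_block = block_dims[0] * block_dims[1] * block_dims[2]
print(grid, threads_per_block)  # (42, 21) blocks, 128 threads (4 warps) each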
+ transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %15 : !transform.any_op + %all_loops_5 = transform.structured.match interface{LoopLikeInterface} + in %15 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_5 : !transform.any_op + transform.apply_patterns to %15 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Adds patterns that unroll vectors to a native tile size for GPUs with mma operations + transform.apply_patterns to %15 { + transform.apply_patterns.buddy.unroll_vectors_gpu_mma_sync + } : !transform.any_op + + // Insert a gpu.barrier after a given scf.for loop + %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for"> + // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () + + + transform.apply_patterns to %15 { + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + transform.apply_cse to %15 : !transform.any_op + + // Hoist vector.transfer_read / vector.transfer_write pairs out of immediately enclosing scf::ForOp iteratively + // Warning: Deprecated + %17 = transform.structured.hoist_redundant_vector_transfers %15 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + %all_loops_6 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_6 : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // This converts slices of operations containing vector.contract op into + // mma operations, targetting warp level tensorcore operations. + transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> () + + // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + %all_loops_7 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_7 : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op + %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc"> + + // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations. 
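The multi-buffering step that follows uses factor = 3; a minimal Python sketch of the idea (illustration only, not part of the patch; the staged "tiles" are placeholders) is:

# Multi-buffering sketch: rotate through `factor` copies of the shared-memory
# staging buffer so that iteration i can fill one slot while earlier
# iterations' slots are still being read.
FACTOR = 3
buffers = [None] * FACTOR

def stage(i, tile):
    buffers[i % FACTOR] = tile      # each iteration writes its own slot

def consume(i):
    return buffers[i % FACTOR]      # and reads back the slot it wrote

for i in range(6):
    stage(i, f"k-step {i}")
    # compute on consume(i) can now overlap the staging of later k-steps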
+ transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op + + transform.apply_patterns to %17 { + transform.apply_patterns.vector.transfer_to_scf full_unroll = true + } : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + %all_loops_8 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_8 : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.vector.lower_masked_transfers + } : !transform.any_op + + // Convert sync copies to shared memory to async. + // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> () + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + %all_loops_9 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_9 : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + + + %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op + %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op + // This applies software pipelining to a given scf.for loop. + // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop. + // %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op + + // Perform canonicalization. 
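For the (currently commented-out) pipeline_shared_memory_copies step above, a rough sketch of the schedule it aims for, assuming depth = 3 as written, might look like this (illustration only, not part of the patch):

# Software-pipelining sketch (illustration only): shared-memory copies run
# `depth` k-steps ahead of the compute that consumes them.
def pipelined_schedule(num_steps, depth=3):
    events = [f"prologue: copy k-step {i}" for i in range(min(depth, num_steps))]
    for i in range(num_steps):
        if i + depth < num_steps:
            events.append(f"copy k-step {i + depth} overlapped with compute k-step {i}")
        else:
            events.append(f"epilogue: compute k-step {i}")
    return events

for event in pipelined_schedule(6):
    print(event)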
+ transform.apply_patterns to %17 { + transform.apply_patterns.vector.lower_masks + } : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.vector.materialize_masks + } : !transform.any_op + transform.apply_patterns to %17 { + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + transform.apply_patterns.canonicalization + transform.apply_patterns.memref.fold_memref_alias_ops + } : !transform.any_op + + %all_loops_10 = transform.structured.match interface{LoopLikeInterface} + in %17 + : (!transform.any_op) -> !transform.any_op + transform.apply_licm to %all_loops_10 : !transform.any_op + transform.apply_cse to %17 : !transform.any_op + + transform.yield + } +} // module diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 2f98949d7d..6a18f8b80d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -186,16 +186,16 @@ def init_op_group(self): # self.op_groups[subgraph_name] = group group = [] for i, op in enumerate(self._body): - if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i==18 or i==21 or i==24: + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i == 25: continue group.append(op) - subgraph_name = "subgraph0" - self.group_map_device[subgraph_name] = DeviceType.GPU - self.op_groups[subgraph_name] = group - - new_group = [self._body[18], self._body[21], self._body[24]] subgraph_name = "subgraph1" self.group_map_device[subgraph_name] = DeviceType.CPU + self.op_groups[subgraph_name] = group + + new_group = [self._body[25]] + subgraph_name = "subgraph0" + self.group_map_device[subgraph_name] = DeviceType.GPU self.op_groups[subgraph_name] = new_group def fuse_ops(self, pattern_list: List[FunctionType]): diff --git a/thirdparty/mimalloc b/thirdparty/mimalloc new file mode 160000 index 0000000000..81a771161e --- /dev/null +++ b/thirdparty/mimalloc @@ -0,0 +1 @@ +Subproject commit 81a771161e37c8559c442fff099115cd1977db1e From a8569cce3babf02de103eb43b85644fbaae3ceaf Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 29 Oct 2024 04:34:48 +0000 Subject: [PATCH 23/29] Separate for heterogeneous demo --- examples/BuddyLeNet/CMakeLists.txt | 146 ++-- examples/BuddyLeNet/matmul-cubin.mlir | 3 - examples/BuddyLeNet/transform.mlir | 311 -------- examples/BuddyTest/.gitignore | 3 - examples/BuddyTest/CMakeLists.txt | 29 - examples/BuddyTest/README.md | 65 -- examples/BuddyTest/import-test.py | 55 -- examples/BuddyTest/makefile | 56 -- examples/BuddyTest/model.py | 37 - examples/BuddyTest/test-main.cpp | 115 --- examples/CMakeLists.txt | 4 - frontend/Python/frontend.py | 4 - frontend/Python/graph/graph.py | 53 +- frontend/Python/graph/graph_driver.py | 3 - frontend/Python/graph/json_decoder.py | 2 - frontend/Python/ops/gpu.py | 729 ------------------ .../Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp | 133 +--- tests/Conversion/convert-memcpy-to-gpu.mlir | 229 +++++- 18 files changed, 333 insertions(+), 1644 deletions(-) delete mode 100644 examples/BuddyLeNet/matmul-cubin.mlir delete mode 100644 examples/BuddyLeNet/transform.mlir delete mode 100644 examples/BuddyTest/.gitignore delete mode 100644 examples/BuddyTest/CMakeLists.txt delete mode 100644 examples/BuddyTest/README.md delete mode 100644 examples/BuddyTest/import-test.py delete mode 100644 examples/BuddyTest/makefile delete mode 100644 examples/BuddyTest/model.py delete mode 100644 examples/BuddyTest/test-main.cpp delete mode 100644 
frontend/Python/ops/gpu.py diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 337c4342bd..6e9cfe1204 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -1,7 +1,7 @@ add_custom_command( OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/arg0.data COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/buddy-lenet-import.py - COMMENT "Generating forward.mlir, subgraph1.mlir and parameter files" + COMMENT "Generating forward.mlir, subgraph0.mlir, subgraph1.mlir and parameter files" ) add_custom_command( @@ -17,93 +17,9 @@ add_custom_command( COMMENT "Building forward.o" VERBATIM) -# add_custom_command( -# OUTPUT subgraph0.ll -# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# --transform-preload-library -transform-library-paths="transform.mlir" -# --transform-interpreter -entry-point="codegen" | -# ${BUDDY_BINARY_DIR}/buddy-opt --pass-pipeline "builtin.module(func.func(nvgpu-optimize-shared-memory))" | -# ${BUDDY_BINARY_DIR}/buddy-opt -# -arith-expand -# -eliminate-empty-tensors -# -empty-tensor-to-alloc-tensor -# -linalg-bufferize -# -convert-linalg-to-affine-loops -# -affine-loop-fusion -# -affine-parallelize -# -lower-affine -# -canonicalize -# -func-bufferize -# -arith-bufferize -# -tensor-bufferize -# -buffer-deallocation -# -finalizing-bufferize -# -canonicalize -# -gpu-launch-sink-index-computations -# -canonicalize -# -legalize-shmem-outlining -# -canonicalize -# -convert-memcpy-to-gpu -# -gpu-async-region -# -canonicalize -# -convert-scf-to-cf -# -memref-expand -# -finalize-memref-to-llvm -# -convert-arith-to-llvm -# -convert-vector-to-llvm -# -convert-gpu-to-nvvm -# -has-redux=1 -# -llvm-request-c-wrappers -# -canonicalize -# -cse -# -sccp | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# --test-lower-to-nvvm -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# COMMENT "Building subgraph0.ll" -# VERBATIM -# ) - -add_custom_command( - OUTPUT subgraph0.ll - COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | - ${LLVM_TOOLS_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir - COMMENT "Building subgraph0.ll" - VERBATIM) - add_custom_command( OUTPUT subgraph0.o - COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o - DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.ll - COMMENT "Building subgraph0.o" - VERBATIM) - - - - -# add_custom_command( -# OUTPUT subgraph1.ll -# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers 
-convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -# COMMENT "Building subgraph1.ll" -# VERBATIM) - -# add_custom_command( -# OUTPUT subgraph1.o -# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o -# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# COMMENT "Building subgraph1.o" -# VERBATIM) - -add_custom_command( - OUTPUT subgraph1.o - COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | ${BUDDY_BINARY_DIR}/buddy-opt -eliminate-empty-tensors @@ -129,11 +45,65 @@ add_custom_command( -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | ${LLVM_TOOLS_BINARY_DIR}/llvm-as | + ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir + COMMENT "Building subgraph0.o" + VERBATIM) + +# new +set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") +set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") +add_custom_command( + OUTPUT subgraph1.o + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt + -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} + -buffer-deallocation + -convert-linalg-to-parallel-loops + -canonicalize + -gpu-map-parallel-loops + -convert-parallel-loops-to-gpu + -gpu-kernel-outlining + -canonicalize + -cse | + ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_TOOLS_BINARY_DIR}/llvm-as | ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir COMMENT "Building subgraph1.o" VERBATIM) +# add_library(LENET_GPU STATIC subgraph0_gpu.o forward.o) + +# SET_TARGET_PROPERTIES(LENET_GPU PROPERTIES LINKER_LANGUAGE C) + +# add_executable(buddy-lenet-run-gpu buddy-lenet-main.cpp) +# target_link_directories(buddy-lenet-run-gpu PRIVATE ${LLVM_LIBRARY_DIR}) + +# set(BUDDY_LENET_LIBS_GPU LENET_GPU mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime ${PNG_LIBRARIES}) + +# target_link_libraries(buddy-lenet-run-gpu ${BUDDY_LENET_LIBS_GPU}) + +# 
add_custom_command( +# OUTPUT subgraph1.ll +# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt +# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir +# COMMENT "Building subgraph1.ll" +# VERBATIM) + +# add_custom_command( +# OUTPUT subgraph1.o +# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o +# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll +# COMMENT "Building subgraph1.o" +# VERBATIM) + add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) @@ -141,7 +111,5 @@ SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C) add_executable(buddy-lenet-run buddy-lenet-main.cpp) target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR}) -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES}) - -set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES}) +set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime BuddyLibDIP ${PNG_LIBRARIES}) target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS}) diff --git a/examples/BuddyLeNet/matmul-cubin.mlir b/examples/BuddyLeNet/matmul-cubin.mlir deleted file mode 100644 index 0a1e515093..0000000000 --- a/examples/BuddyLeNet/matmul-cubin.mlir +++ /dev/null @@ -1,3 +0,0 @@ -module { -} - diff --git a/examples/BuddyLeNet/transform.mlir b/examples/BuddyLeNet/transform.mlir deleted file mode 100644 index e2a02a9a97..0000000000 --- a/examples/BuddyLeNet/transform.mlir +++ /dev/null @@ -1,311 +0,0 @@ -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen(%arg0: !transform.any_op) { - // Match the target operations and assign them to SSA values. - %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 - : (!transform.any_op) -> !transform.any_op - %fill = transform.structured.match ops{["linalg.fill"]} in %arg0 - : (!transform.any_op) -> !transform.any_op - - // Perform tiling for the grid. - // For the matrix multiplication of 5376x2048 and 2048x5376, the compilation - // strategy sets the tile size for grid-based partitioning to 128x256. - // This means that each [128, 2048] @ [2048, 256] matmul tile is computed within a GPU block, - // while multiple such blocks are computed in parallel across the grid. - // `tile_sizes` specify the dimensions of the tiled matmul result. - // `%tiled_op` is the tiled matmul operation within the `scf.forall` loop. - // `%forall_op` is the `scf.forall` loop that maintains tile information. 
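-    // Concretely, for the 5376x5376 result described above this yields 5376/128 = 42 tiles
-    // along the rows and 5376/256 = 21 tiles along the columns, i.e. a 42x21 grid
-    // of blocks, each computing one [128, 2048] @ [2048, 256] partial product.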
-    %tiled_op, %forall_op = transform.structured.tile_using_forall %matmul
-      tile_sizes [128, 256] (mapping = [#gpu.block, #gpu.block])
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Perform canonicalization.
-    %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %1 {
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %1 : !transform.any_op
-    %all_loops = transform.structured.match interface{LoopLikeInterface}
-        in %arg0
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_licm to %all_loops : !transform.any_op
-    transform.apply_patterns to %1 {
-      transform.apply_patterns.linalg.tiling_canonicalization
-    } : !transform.any_op
-
-    // Fuse the fill operation into the scf.forall op.
-    %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Further tile the tiled matmul:
-    // tile the third (reduction) dimension of the matmul.
-    // The [128, 2048] @ [2048, 256] matmul is further tiled into [128, 16] @ [16, 256] matmuls.
-    %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Create pad op and prepare for mapping to GPU.
-    // Nothing has changed in the operation.
-    %padded, %pad, %copy = transform.structured.pad %tiled_linalg_op {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Rewrite tensor.pad into linalg.copy.
-    %3 = transform.get_producer_of_operand %padded[0] : (!transform.any_op) -> !transform.any_op
-    %4 = transform.get_producer_of_operand %padded[1] : (!transform.any_op) -> !transform.any_op
-    %5 = transform.get_producer_of_operand %padded[2] : (!transform.any_op) -> !transform.any_op
-    %6 = transform.structured.rewrite_in_destination_passing_style %3 : (!transform.any_op) -> !transform.any_op
-    %7 = transform.structured.rewrite_in_destination_passing_style %4 : (!transform.any_op) -> !transform.any_op
-    %8 = transform.structured.rewrite_in_destination_passing_style %5 : (!transform.any_op) -> !transform.any_op
-
-    // Tile the linalg.copy ops and map them to the GPU thread level,
-    // so that the tiled matrices are copied to GPU shared memory.
-    // num_threads is different from the tile_sizes used above:
-    // it specifies the number of tiles instead of the size of each tile.
-    // The first transform tiles the [128, 16] copy into [4, 4] pieces,
-    // and the second tiles the [16, 256] copy into [2, 16] pieces.
-    %tiled_op_0, %forall_op_1 = transform.structured.tile_using_forall %6 num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %tiled_op_2, %forall_op_3 = transform.structured.tile_using_forall %7 num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Tile the linalg.matmul op and map it to the GPU warp level.
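-    // With num_threads [2, 2], the [128, 256] block tile is split across a 2x2
-    // arrangement of warps, so each warp owns a [64, 128] piece of the result.
-    // This is consistent with the block_dims = [64, 2, 1] / warp_size = 32 mapping
-    // applied later, which likewise gives 64 * 2 / 32 = 4 warps per block.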
- %tiled_op_4, %forall_op_5 = transform.structured.tile_using_forall %padded num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Tile the linalg.fill op and map it to GPU warp level. - %tiled_op_6, %forall_op_7 = transform.structured.tile_using_forall %fused_op num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Perform canonicalization. - %9 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %9 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %9 : !transform.any_op - %all_loops_2 = transform.structured.match interface{LoopLikeInterface} - in %9 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_2 : !transform.any_op - transform.apply_patterns to %9 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Perform vectorization. - // Vectorize the linalg.copy, linalg.fill, and linalg.matmul operations. - %10 = transform.structured.vectorize_children_and_apply_patterns %9 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %10 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %10 : !transform.any_op - %all_loops_3 = transform.structured.match interface{LoopLikeInterface} - in %10 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_3 : !transform.any_op - transform.apply_patterns to %10 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Match bufferization.alloc_tensors inside the forall op - %scf_forall = transform.structured.match ops{["scf.forall"]} attributes{mapping = [#gpu.block, #gpu.block]} in %arg0 : (!transform.any_op) -> !transform.any_op - %alloc_tensor_ops = transform.structured.match ops{["bufferization.alloc_tensor"]} in %scf_forall : (!transform.any_op) -> !transform.any_op - - // Bufferize the alloc_tensor ops to memref.alloc ops. - // The memory_space attribute for GPU Dialect 0 means global memory, 3 means workgroup memory address, 5 means private memory address. - // According to https://discourse.llvm.org/t/rfc-memref-memory-shape-as-attribute/2229 - %buffer, %new_ops = transform.structured.bufferize_to_allocation %alloc_tensor_ops {memory_space = 3 } : !transform.any_op - - // Eliminate empty tensors and erase unnecessary inputs. - transform.structured.eliminate_empty_tensors %arg0 : !transform.any_op - %func_eras = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_eras { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - - // Bufferize the remaining operations in one time. - %11 = transform.bufferization.one_shot_bufferize %arg0 { bufferize_function_boundaries = true, function_boundary_type_conversion = 1 : i32} : (!transform.any_op) -> !transform.any_op - - // Erase dead alloc and stores. 
- %12 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op - transform.memref.erase_dead_alloc_and_stores %12 : (!transform.any_op) -> () - - // Generate GPU launch. - %13 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op - %gpu_launch = transform.gpu.map_forall_to_blocks %13 { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op - - // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute. - %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op - - %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op - - // Removes unnecessary GPU barriers from the function. - // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %15 : !transform.any_op - %all_loops_4 = transform.structured.match interface{LoopLikeInterface} - in %15 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_4 : !transform.any_op - transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Identify static memory allocations within the given region, - // and move them to a higher level (hoisting). - transform.buddy.hoist_static_alloc %15 : (!transform.any_op) -> () - - // Collects patterns for folding memref aliasing ops (memref.subview) into consumer load/store ops (affine.load, memref.load, nvgpu.ldmatrix, vector.load, vector.transfer_read, affine.store, memref.store, etc.) and other ops (e.g., memref.subview). - transform.apply_patterns to %15 { - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - // Collects patterns for extracting address computations from operations with memory accesses such that these memory accesses use only a base pointer. - transform.apply_patterns to %15 { - transform.apply_patterns.memref.extract_address_computations - } : !transform.any_op - // Perform canonicalization. 
- transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %15 : !transform.any_op - %all_loops_5 = transform.structured.match interface{LoopLikeInterface} - in %15 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_5 : !transform.any_op - transform.apply_patterns to %15 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Adds patterns that unroll vectors to a native tile size for GPUs with mma operations - transform.apply_patterns to %15 { - transform.apply_patterns.buddy.unroll_vectors_gpu_mma_sync - } : !transform.any_op - - // Insert a gpu.barrier after a given scf.for loop - %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for"> - // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () - - - transform.apply_patterns to %15 { - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - transform.apply_cse to %15 : !transform.any_op - - // Hoist vector.transfer_read / vector.transfer_write pairs out of immediately enclosing scf::ForOp iteratively - // Warning: Deprecated - %17 = transform.structured.hoist_redundant_vector_transfers %15 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - %all_loops_6 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_6 : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // This converts slices of operations containing vector.contract op into - // mma operations, targetting warp level tensorcore operations. - transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> () - - // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - %all_loops_7 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_7 : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op - %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc"> - - // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations. 
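-    // With factor = 3 below, three copies of the shared-memory tile are rotated
-    // across loop iterations, so a later iteration's copy-in does not have to wait
-    // for the previous iteration to finish reading its buffer; the (currently
-    // disabled) shared-memory-copy pipelining further down relies on this.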
- transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op - - transform.apply_patterns to %17 { - transform.apply_patterns.vector.transfer_to_scf full_unroll = true - } : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - %all_loops_8 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_8 : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.any_op - - // Convert sync copies to shared memory to async. - // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> () - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - %all_loops_9 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_9 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - - - %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op - %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op - // This applies software pipelining to a given scf.for loop. - // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop. - // %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op - - // Perform canonicalization. 
- transform.apply_patterns to %17 { - transform.apply_patterns.vector.lower_masks - } : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.vector.materialize_masks - } : !transform.any_op - transform.apply_patterns to %17 { - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - transform.apply_patterns.memref.fold_memref_alias_ops - } : !transform.any_op - - %all_loops_10 = transform.structured.match interface{LoopLikeInterface} - in %17 - : (!transform.any_op) -> !transform.any_op - transform.apply_licm to %all_loops_10 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op - - transform.yield - } -} // module diff --git a/examples/BuddyTest/.gitignore b/examples/BuddyTest/.gitignore deleted file mode 100644 index 081f173509..0000000000 --- a/examples/BuddyTest/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -__pycache__ -*.mlir -log.ll diff --git a/examples/BuddyTest/CMakeLists.txt b/examples/BuddyTest/CMakeLists.txt deleted file mode 100644 index 8039bfcc15..0000000000 --- a/examples/BuddyTest/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -add_custom_command( - OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir - COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyTest/import-test.py - COMMENT "Generating forward.mlir" -) - - -add_custom_command( - OUTPUT forward.o - COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | - ${LLVM_MLIR_BINARY_DIR}/mlir-opt - -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llvm-as | - ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyTest/forward.o - DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyTest/forward.mlir - COMMENT "Building forward.o" - VERBATIM) - - -add_library(TEST STATIC forward.o) - -SET_TARGET_PROPERTIES(TEST PROPERTIES LINKER_LANGUAGE C) - -add_executable(buddy-test-run test-main.cpp) -target_link_directories(buddy-test-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR}) - -set(BUDDY_TEST_LIBS TEST mlir_runner_utils mlir_cuda_runtime) -target_link_libraries(buddy-test-run ${BUDDY_TEST_LIBS}) diff --git a/examples/BuddyTest/README.md b/examples/BuddyTest/README.md deleted file mode 100644 index f057723bb3..0000000000 --- a/examples/BuddyTest/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Buddy Compiler Test Example - -0. Activate your python environment. - -1. Build LLVM/MLIR - -```bash -$ cd buddy-mlir -$ mkdir llvm/build -$ cd llvm/build -$ cmake -G Ninja ../llvm \ - -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \ - -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \ - -DMLIR_ENABLE_CUDA_RUNNER=ON \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ - -DCMAKE_BUILD_TYPE=RELEASE \ - -DMLIR_ENABLE_BINDINGS_PYTHON=ON \ - -DPython3_EXECUTABLE=$(which python3) -$ ninja check-clang check-mlir omp -``` - -2. Build buddy-mlir - -```bash -$ mkdir build && cd build -$ cmake -G Ninja .. 
\ - -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \ - -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DCMAKE_BUILD_TYPE=RELEASE \ - -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \ - -DPython3_EXECUTABLE=$(which python3) -$ ninja -$ ninja check-buddy -``` - -3. Set the `PYTHONPATH` environment variable. - -Make sure you are in the build directory. - -```bash -$ export BUDDY_MLIR_BUILD_DIR=$PWD -$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build -$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} -``` - -4. Build and run the Test example - -```bash -$ cmake -G Ninja .. -DBUDDY_TEST_EXAMPLES=ON -$ ninja buddy-test-run -$ cd bin -$ ./buddy-test-run -``` - -## Debug the Lowering Pass Pipeline with Fake Parameters. - -```bash -$ cd buddy-mlir -$ cd examples/BuddyTest -$ make gpu-test-lower -$ make gpu-test-translate -$ make gpu-test-run -``` diff --git a/examples/BuddyTest/import-test.py b/examples/BuddyTest/import-test.py deleted file mode 100644 index 79620d9d44..0000000000 --- a/examples/BuddyTest/import-test.py +++ /dev/null @@ -1,55 +0,0 @@ -# ===- buddy-lenet-import.py --------------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# This is the Test model AOT importer. -# -# ===--------------------------------------------------------------------------- - -import os -from pathlib import Path - -import numpy as np -import torch -from torch._inductor.decomposition import decompositions as inductor_decomp - -from buddy.compiler.frontend import DynamoCompiler -from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse -from buddy.compiler.ops.gpu import ops_registry as gpu_ops_registry -from model import TestModule - -model = TestModule() -model = model.eval() - -# Initialize Dynamo Compiler with specific configurations as an importer. -dynamo_compiler = DynamoCompiler( - primary_registry=gpu_ops_registry, - aot_autograd_decomposition=inductor_decomp, -) - -data = torch.randn([1, 1, 12, 10]) -# Import the model into MLIR module and parameters. 
-with torch.no_grad(): - graphs = dynamo_compiler.importer(model, data) - -assert len(graphs) == 1 -graph = graphs[0] -print(graph.body) -graph.lower_to_top_level_ir() -path_prefix = os.path.dirname(os.path.abspath(__file__)) -with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: - print(graph._imported_module, file=module_file) - \ No newline at end of file diff --git a/examples/BuddyTest/makefile b/examples/BuddyTest/makefile deleted file mode 100644 index 9c4c2e4a0c..0000000000 --- a/examples/BuddyTest/makefile +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -BUDDY_OPT := ../../build/bin/buddy-opt -MLIR_OPT := ../../llvm/build/bin/mlir-opt -MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate -MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner -LLC := ../../llvm/build/bin/llc -OPT_FLAG := -O0 - -ifeq ($(shell uname),Linux) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so -MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so -MLIR_CUDA_RUNTIME := ../../llvm/build/lib/libmlir_cuda_runtime.so -MTRIPLE := x86_64-unknown-linux-gnu -else ifeq ($(shell uname),Darwin) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib -MLIR_ASYNC_RUNTIME := ./../llvm/build/lib/libmlir_async_runtime.dylib -MTRIPLE := x86_64-apple-darwin -endif - -gpu-test-lower: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_OPT} -o log.mlir - -gpu-test-translate: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll - -gpu-test-run: - @${MLIR_OPT} forward.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} - -gpu-conv2d-lower: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_OPT} -o log.mlir - -gpu-conv2d-translate: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, 
gpu-module-to-binary)" | \ - ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll - -gpu-conv2d-run: - @${MLIR_OPT} conv2d.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm | \ - ${MLIR_OPT} -pass-pipeline="builtin.module(nvvm-attach-target{chip=sm_70 O=3},\ - gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | \ - ${MLIR_CPU_RUNNER} -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_CUDA_RUNTIME} diff --git a/examples/BuddyTest/model.py b/examples/BuddyTest/model.py deleted file mode 100644 index d72af61c95..0000000000 --- a/examples/BuddyTest/model.py +++ /dev/null @@ -1,37 +0,0 @@ -# ===- model.py ---------------------------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# Test model definition. -# -# ===--------------------------------------------------------------------------- - -import torch -import torch.nn as nn - -class TestModule(nn.Module): - def __init__(self): - super(TestModule, self).__init__() - self.conv1 = nn.Conv2d(1, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.fc1 = nn.Linear(120,84) - - def forward(self, x): - # x = self.conv1(x) - # x = self.pool(x) - x = x.view(-1, 120) - x = self.fc1(x) - return x - diff --git a/examples/BuddyTest/test-main.cpp b/examples/BuddyTest/test-main.cpp deleted file mode 100644 index d1764bccd2..0000000000 --- a/examples/BuddyTest/test-main.cpp +++ /dev/null @@ -1,115 +0,0 @@ -//===- test-main.cpp ------------------------------------------------------===// -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include - -using namespace buddy; - -// extern "C" void -// _mlir_ciface_forward(MemRef *result, MemRef *filter, MemRef *bias, MemRef *input); - -extern "C" void -_mlir_ciface_forward(MemRef *result, MemRef *input); - -int main() { - /// Initialize data containers. 
- const int N = 1; - const int C = 1; - const int K = 1; - const int kernel_size = 2; - const int stride = 2; - const int H = 32; - const int W = 32; - const int H_out = H / kernel_size; - const int W_out = W / kernel_size; - - MemRef input({N, C, H, W}); - // MemRef filter({K, C, kernel_size, kernel_size}); - // MemRef bias({K}); - MemRef result({N, C, H_out, W_out}); - - // Initial the input data - for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { - for (int i = 0; i < H; i++) { - for (int j = 0; j < W; j++) { - int index = n * C * H * W + c * H * W + i * W + j; - input[index] = static_cast((float)index/(H*W)); - } - } - } - } - // for (int k = 0; k < K; k++) { - // for (int c = 0; c < C; c++) { - // for (int i = 0; i < kernel_size; i++) { - // for (int j = 0; j < kernel_size; j++) { - // int index = k * C * kernel_size * kernel_size + c * kernel_size * kernel_size + i * kernel_size + j; - // filter[index] = static_cast(1); - // } - // } - // } - // } - - // for (int k = 0; k < K; k++) { - // bias[k] = 1; - // } - - // Print the generated data to verify - - // for (int i = 0; i < H; i++) { - // for (int j = 0; j < W; j++) { - // std::cout << input[i * W + j] << " "; - // } - // std::cout << std::endl; - // } - - const auto inferenceStart = std::chrono::high_resolution_clock::now(); - - /// Execute forward inference of the model. - _mlir_ciface_forward(&result, &input); - - const auto inferenceEnd = std::chrono::high_resolution_clock::now(); - const std::chrono::duration inferenceTime = - inferenceEnd - inferenceStart; - - /// Print the output data for verification. - std::cout << "\033[33;1m[Output] \033[0m"; - std::cout << "["; - for (int i = 0; i < H_out; i++) { - if (i > 0) std::cout << " "; - std::cout << "["; - for (int j = 0; j < W_out; j++) { - if (j > 0) std::cout << " "; - std::cout << result[i * W_out + j]; - } - std::cout << "]"; - if (i < H_out - 1) std::cout << "\n "; - } - std::cout << "]" << std::endl; - - /// Print the performance. 
- std::cout << "\033[33;1m[Time] \033[0m"; - std::cout << inferenceTime.count() << " ms" - << std::endl; - - return 0; -} diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c4449a0a81..3aa1195d10 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -16,10 +16,6 @@ if (BUDDY_LENET_EXAMPLES) add_subdirectory(BuddyLeNet) endif() -if (BUDDY_TEST_EXAMPLES) - add_subdirectory(BuddyTest) -endif() - if(BUDDY_WHISPER_EXAMPLES) add_subdirectory(BuddyWhisper) endif() diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index 4d613473a0..9d8c80f014 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -42,7 +42,6 @@ from .ops.tosa import ops_registry as tosa_ops_registry from .ops.math import ops_registry as math_ops_registry from .ops.func import ops_registry as func_ops_registry -from .ops.gpu import ops_registry as gpu_ops_registry from .graph import Graph, TensorDType, TensorMeta from .graph.operation import * from .graph.transform import maxpool2d_simplify @@ -99,14 +98,12 @@ def __init__( self._verbose = verbose self._imported_graphs = [] self._ops_registry = {} - self._ops_gpu_registry = {} self._imported_params = {} self._ops_registry.update(math_ops_registry) self._ops_registry.update(linalg_ops_registry) self._ops_registry.update(tosa_ops_registry) self._ops_registry.update(func_ops_registry) self._ops_registry.update(primary_registry) - self._ops_gpu_registry.update(gpu_ops_registry) self._ops_map = { "output": OutputOp, "placeholder": PlaceholderOp, @@ -286,7 +283,6 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): func_inputs, fake_params, self._ops_registry, - self._ops_gpu_registry, self._func_name, self._verbose ) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 6a18f8b80d..c7239a0d7d 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -107,7 +107,6 @@ def __init__( inputs: List[TensorMeta], fake_params: List[TensorMeta], ops_registry: dict, - ops_gpu_registry: dict, func_name: str, device: DeviceType = DeviceType.CPU, verbose=False @@ -133,7 +132,6 @@ def __init__( self._imported_module = None self._verbose = verbose self._ops_registry = ops_registry - self._ops_gpu_registry = ops_gpu_registry self._func_name = func_name self._ctx = ir.Context() self._output_memref = None @@ -182,7 +180,7 @@ def init_op_group(self): # continue # group = [op] # subgraph_name = "subgraph{}".format(i) - # self.group_map_device[subgraph_name] = DeviceType.GPU + # self.group_map_device[subgraph_name] = DeviceType.CPU # self.op_groups[subgraph_name] = group group = [] for i, op in enumerate(self._body): @@ -258,7 +256,6 @@ def lower_to_top_level_ir(self): self._inputs, self._func_name, self._ops_registry, - self._ops_gpu_registry, False, self.device, verbose=self._verbose @@ -455,7 +452,6 @@ def __init__( inputs: List[TensorMeta], func_name: str, ops_registry: dict, - ops_gpu_registry: dict, do_param_pack: bool = False, device: DeviceType = DeviceType.CPU, verbose=False @@ -483,7 +479,6 @@ def __init__( self._num_input_visited = 0 self._module = ir.Module.create() self._ops_registry = ops_registry - self._ops_gpu_registry = ops_gpu_registry self._current_param_pack_offset = None def _str_to_mlir_dtype(self, dtype: str) -> ir.Type: @@ -577,11 +572,11 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] - if self._device == DeviceType.GPU: - returns = [ - buffer.to_tensor(ret) - for ret 
in returns - ] + # if self._device == DeviceType.GPU: + # returns = [ + # buffer.to_tensor(ret) + # for ret in returns + # ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) @@ -609,8 +604,8 @@ def generated_func(*args): return self._symbol_table.get(("output", 0)) - if self._device == DeviceType.GPU: - self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() + # if self._device == DeviceType.GPU: + # self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() return self._module @@ -712,14 +707,14 @@ def _import_placeholder( placeholder_name = args_list[self._num_input_visited] # TODO : Consider converting arg type from RankedTensorType to MemRefType - if self._device == DeviceType.GPU: - placeholder_name = buffer.to_memref( - ir.MemRefType.get( - list(node.tensor_meta.shape), - self._str_to_mlir_dtype(node.tensor_meta.dtype) - ), - placeholder_name - ) + # if self._device == DeviceType.GPU: + # placeholder_name = buffer.to_memref( + # ir.MemRefType.get( + # list(node.tensor_meta.shape), + # self._str_to_mlir_dtype(node.tensor_meta.dtype) + # ), + # placeholder_name + # ) self._symbol_table[(str(node.name), 0)] = placeholder_name self._num_input_visited += 1 @@ -734,14 +729,14 @@ def _import_op(self, node: Op): """ op_name = node.__class__.__name__ - if self._device == DeviceType.CPU: - op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - self._ops_registry[op_name](node, self._symbol_table) - ) - else: - op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - self._ops_gpu_registry[op_name](node, self._symbol_table) - ) + # if self._device == DeviceType.CPU: + op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + self._ops_registry[op_name](node, self._symbol_table) + ) + # else: + # op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( + # self._ops_gpu_registry[op_name](node, self._symbol_table) + # ) if isinstance(op_ret, tuple | List): for i, operation in enumerate(op_ret): if isinstance(operation, ir.Operation) or isinstance( diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 356eb0922b..58e7766cb1 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -153,7 +153,6 @@ def build_subgraph_by_group(self): subgraph_input, [], self._graph._ops_registry, - self._graph._ops_gpu_registry, subgraph_name, subgraph_device, verbose=self._graph._verbose @@ -217,7 +216,6 @@ def construct_main_graph(self, do_param_pack=False): self._graph._inputs, self._graph._fake_params, self._graph._ops_registry, - self._graph._ops_gpu_registry, self._graph._func_name, self._graph._verbose ) @@ -298,7 +296,6 @@ def construct_main_graph(self, do_param_pack=False): main_graph._inputs, main_graph._func_name, main_graph._ops_registry, - main_graph._ops_gpu_registry, do_param_pack, ) return main_importer.import_main_graph() diff --git a/frontend/Python/graph/json_decoder.py b/frontend/Python/graph/json_decoder.py index d8bac5c77a..cfa825b0aa 100644 --- a/frontend/Python/graph/json_decoder.py +++ b/frontend/Python/graph/json_decoder.py @@ -10,7 +10,6 @@ from ..ops.tosa import ops_registry as tosa_ops_registry from ..ops.math import ops_registry as math_ops_registry from ..ops.func import ops_registry as func_ops_registry -from ..ops.gpu import ops_registry as gpu_ops_registry def json_to_graph(json_str): """ @@ -59,7 +58,6 @@ def json_to_tensormeta(json_data): inputs, 
params, ops_registry, - gpu_ops_registry, graph_name ) graph.device = _graph['device'] diff --git a/frontend/Python/ops/gpu.py b/frontend/Python/ops/gpu.py deleted file mode 100644 index 9c8a5265e3..0000000000 --- a/frontend/Python/ops/gpu.py +++ /dev/null @@ -1,729 +0,0 @@ -# ===- gpu.py ----------------------------------------------------------------- -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ===--------------------------------------------------------------------------- -# -# The registry of mappings from Buddy node to MLIR GPU kernel. -# -# ===--------------------------------------------------------------------------- - - -from typing import Tuple -import mlir.ir as ir -from mlir.dialects import gpu, memref, arith, scf, vector - -from ..graph import TensorDType -from ..graph import ( - ReluOp, - ReshapeOp, - PermuteOp, - Conv2dOp, - MaxPool2dOp, - AddMMOp -) -from .utils import * - -TILE_WIDTH = 16 - -def relu_op(node: ReluOp, symbol_table: Dict[Tuple[str, int], ir.Operation]): - """ - Import the buddy ReluOp. - From Buddy ReluOp to MLIR Relu GPU kernel. - """ - assert len(node.args) == 1 - input = symbol_table.get((str(node.args[0]), 0)) - if input is None: - return - output_shape = list(node.tensor_meta["shape"]) - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) - - # Flatten the input into a one-dimensional format - output_size = tensor_shape_size(output_shape) - size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) - shape = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - memref.StoreOp(size, shape, [c0]) - memref_reshape_type = ir.MemRefType.get([output_size], element_type) - input_reshape = memref.ReshapeOp(memref_reshape_type, input, shape) - - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input) - gpu.HostRegisterOp(input_cast) - gpu_kernel = gpu.LaunchOp( - asyncToken=None, - asyncDependencies=[], - gridSizeX=c1.result, - gridSizeY=c1.result, - gridSizeZ=c1.result, - blockSizeX=kernels.result, - blockSizeY=c1.result, - blockSizeZ=c1.result, - ) - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), # block_id x - ir.IndexType.get(), # block_id y - ir.IndexType.get(), # block_id z - ir.IndexType.get(), # thread_id x - ir.IndexType.get(), # thread_id y - ir.IndexType.get(), # thread_id z - ir.IndexType.get(), # grid_size x - ir.IndexType.get(), # grid_size y - ir.IndexType.get(), # grid_size z - ir.IndexType.get(), # block_size x - ir.IndexType.get(), # block_size y - ir.IndexType.get(), # block_size z - ] - ) - - with ir.InsertionPoint(gpu_kernel_block): - 
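-        # Block-stride loop: the kernel is launched with a single block of 512
-        # threads (see the launch configuration above), so thread t processes
-        # elements t, t + 512, t + 1024, ... of the flattened tensor until `size`.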
thread_local_idx = gpu_kernel_block.arguments[3] - element_attr = mlir_element_attr_get(dtype, 0.0) - cst_0 = arith.ConstantOp(element_type, element_attr) - loop = scf.ForOp( - lower_bound=thread_local_idx, - upper_bound=size, - step=gpu_kernel.blockSizeX - ) - with ir.InsertionPoint(loop.body): - load = memref.LoadOp(input_reshape, [loop.induction_variable]) - result = arith.MaxNumFOp(load, cst_0) - memref.StoreOp(result, input_reshape, [loop.induction_variable]) - scf.YieldOp([]) - - gpu.TerminatorOp() - - gpu.HostUnregisterOp(input_cast) - output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - memref.CopyOp(input, output) - return output - - -# TODO: Implement Reshape Operation on GPU in future revisions. -def reshape_op(node: ReshapeOp, symbol_table): - """ - Import the reshape operation. - From buddy graph ir's `ReshapeOp` operator to MLIR Memref `reshape` - operation. - - Note: If the new shape contains one and only one `-1`, the size of the new - shape will be inferred automatically. - """ - input1 = symbol_table.get((str(node.args[0]), 0)) - new_shape = [] - for i in node.args[1]: - new_shape.append(i) - output_shape = list(node.tensor_meta["shape"]) - total_size = tensor_shape_size(output_shape) - - neg_one_cnt = 0 - rest_size = 1 - for dim_siz in new_shape: - if dim_siz == -1: - neg_one_cnt += 1 - continue - rest_size *= dim_siz - - if neg_one_cnt != 0: - if neg_one_cnt > 1 or total_size % rest_size != 0: - raise ValueError("Can not infer the new shape!") - infer_dim_size = total_size // rest_size - for i, _ in enumerate(new_shape): - if new_shape[i] == -1: - new_shape[i] = infer_dim_size - - shape = memref.AllocOp(ir.MemRefType.get([len(new_shape)], ir.IndexType.get()), [], []) - for i, _ in enumerate(new_shape): - c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i)) - size = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), new_shape[i])) - memref.StoreOp(size, shape, [c]) - - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - output_type = ir.MemRefType.get(new_shape, element_type) - op = memref.ReshapeOp(output_type, input1, shape) - - return op - - -# TODO: Implement Permute Operation on GPU in future revisions. -def permute_op(node: PermuteOp, symbol_table): - """ - Import the permute operation. - From buddy graph ir's `PermuteOp` operator to MLIR Memref `transpose` - operation. - """ - input1 = symbol_table.get((str(node.args[0]), 0)) - perm_map = node.args[1] - perm_map_attr = ir.AffineMapAttr.get(ir.AffineMap.get_permutation(perm_map)) - - output_shape = list(node.tensor_meta["shape"]) - dtype = node.tensor_meta["dtype"] - - element_type = mlir_element_type_get(dtype) - element_attr = mlir_element_attr_get(dtype, 0.0) - - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - f0 = arith.ConstantOp(element_type, element_attr) - - v0 = vector.transfer_read( - vector=ir.VectorType.get(output_shape, element_type), - source=input1, - indices=[c0]*len(output_shape), - permutation_map=perm_map_attr, - padding=f0 - ) - - transpose = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - - vector.transfer_write( - result=None, - vector=v0, - source=transpose, - indices=[c0]*len(output_shape), - permutation_map=ir.AffineMapAttr.get( - ir.AffineMap.get_permutation([i for i in range(len(output_shape))]) - ) - ) - return transpose - - -# TODO: Consider the cases where the arguments take different values. 
-def convolution2d_op(node: Conv2dOp, symbol_table): - """ - Import the convolution operation. - From Buddy Conv2dOp to MLIR GPU `conv2d` kernel. - arg[0]: Tensor input - arg[1]: Tensor weight - arg[2]: Tensor? bias - arg[3]: SymInt[] stride - arg[4]: SymInt[] padding - arg[5]: SymInt[] dilation - arg[6]: bool transposed - arg[7]: SymInt[] output_padding - arg[8]: SymInt groups - """ - # Get arguments from convolution node. - assert len(node.args) == 9 - input = node.args[0] - filter = node.args[1] - bias = node.args[2] - stride = node.args[3] - input_padding = node.args[4] - dilation = node.args[5] - is_kernel_transposed = node.args[6] - out_padding = node.args[7] - groups = node.args[8] - - # TODO: Consider the cases where the variables take different values. - assert input_padding[0] == input_padding[1] == 0 - assert dilation[0] == dilation[1] == 1 - assert is_kernel_transposed == False - assert out_padding[0] == out_padding[1] == 0 - assert groups == 1 - - # Prepare input, filter, and output information. - input_val = symbol_table.get((str(input), 0)) - input_shape = list(ir.MemRefType(input_val.type).shape) - filter_val = symbol_table.get((str(filter), 0)) - filter_shape = ir.MemRefType(filter_val.type).shape - bias_val = symbol_table.get((str(bias), 0)) - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - output_shape = list(node.tensor_meta["shape"]) - - batch_size = input_shape[0] - in_channels = input_shape[1] - out_channels = output_shape[1] - in_size_h = input_shape[2] - in_size_w = input_shape[3] - out_size_h = output_shape[2] - out_size_w = output_shape[3] - H_filter = filter_shape[2] - W_filter = filter_shape[3] - - output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input_val) - filter_cast = memref.CastOp(unranked_memref_type, filter_val) - bias_cast = memref.CastOp(unranked_memref_type, bias_val) - output_cast = memref.CastOp(unranked_memref_type, output_val) - - gpu.HostRegisterOp(input_cast) - gpu.HostRegisterOp(filter_cast) - gpu.HostRegisterOp(bias_cast) - gpu.HostRegisterOp(output_cast) - - # Tile the input_val into Grids - block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) - batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) - in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) - out_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_channels)) - block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) - tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) - H_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), H_filter)) - W_filter_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), W_filter)) - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - - # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) - - gpu_kernel = gpu.LaunchOp( - asyncToken=None, - asyncDependencies=[], - gridSizeX=batch_size_val.result, - gridSizeY=out_channels_val.result, - 
gridSizeZ=block_z_val.result, - blockSizeX=tile_width_val.result, - blockSizeY=tile_width_val.result, - blockSizeZ=c1.result, - ) - - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), # block_id x - ir.IndexType.get(), # block_id y - ir.IndexType.get(), # block_id z - ir.IndexType.get(), # thread_id x - ir.IndexType.get(), # thread_id y - ir.IndexType.get(), # thread_id z - ir.IndexType.get(), # grid_size x - ir.IndexType.get(), # grid_size y - ir.IndexType.get(), # grid_size z - ir.IndexType.get(), # block_size x - ir.IndexType.get(), # block_size y - ir.IndexType.get(), # block_size z - ] - ) - - with ir.InsertionPoint(gpu_kernel_block): - batch_id = gpu_kernel_block.arguments[0] - out_channel_id = gpu_kernel_block.arguments[1] - tile_id = gpu_kernel_block.arguments[2] - thread_local_idx = gpu_kernel_block.arguments[3] - thread_local_idy = gpu_kernel_block.arguments[4] - - # Calculate the convolution element at (h, w) for this thread - tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH - tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) - - t0 = arith.divui(tile_id, tile_num_val) - t1 = arith.muli(t0, tile_width_val) - thread_global_idx = arith.addi(t1, thread_local_idx) - - t2 = arith.remui(tile_id, tile_num_val) - t3 = arith.muli(t2, tile_width_val) - thread_global_idy = arith.addi(t3, thread_local_idy) - - stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) - stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) - t4 = arith.muli(thread_global_idx, stride_h) - t5 = arith.muli(thread_global_idy, stride_w) - - # Check if the (h, w) is out of the output bounds - ult = 6 - out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) - out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) - isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) - isInBounds = arith.andi(isHInBounds, isWInBounds) - - cst_0 = arith.ConstantOp(element_type, mlir_element_attr_get(dtype, 0.0)) - branch0 = scf.IfOp(isInBounds) - with ir.InsertionPoint(branch0.then_block): - loop0 = scf.ForOp( - lower_bound=c0.result, - upper_bound=in_channels_val.result, - step=c1.result, - iter_args=[cst_0.result] - ) - with ir.InsertionPoint(loop0.body): - loop1 = scf.ForOp( - lower_bound=c0.result, - upper_bound=H_filter_val.result, - step=c1.result, - iter_args=[cst_0.result] - ) - with ir.InsertionPoint(loop1.body): - loop2 = scf.ForOp( - lower_bound=c0.result, - upper_bound=W_filter_val.result, - step=c1.result, - iter_args=[cst_0.result] - ) - with ir.InsertionPoint(loop2.body): - # TODO : loop body - in_channel_id = loop0.body.arguments[0] - filter_ele_idx = loop1.body.arguments[0] - filter_ele_idy = loop2.body.arguments[0] - input_ele_idx = arith.addi(t4, filter_ele_idx) - input_ele_idy = arith.addi(t5, filter_ele_idy) - input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) - filter_ele = memref.LoadOp(filter_val, [out_channel_id, in_channel_id, filter_ele_idx, filter_ele_idy]) - t6 = arith.mulf(input_ele, filter_ele) - iter_arg2 = loop2.body.arguments[1] - iter_res2 = arith.addf(iter_arg2, t6) - scf.YieldOp([iter_res2]) - - iter_arg1 = loop1.body.arguments[1] - iter_res1 = arith.addf(loop2, iter_arg1) - 
scf.YieldOp([iter_res1]) - - iter_arg0 = loop0.body.arguments[1] - iter_res0 = arith.addf(loop1, iter_arg0) - scf.YieldOp([iter_res0]) - - # Add bias data for any out_channel. - bias_ele = memref.LoadOp(bias_val, [out_channel_id]) - result = arith.addf(loop0, bias_ele) - memref.StoreOp(result, output_val, [batch_id, out_channel_id, thread_global_idx, thread_global_idy]) - scf.YieldOp([]) - - gpu.TerminatorOp() - - gpu.HostUnregisterOp(input_cast) - gpu.HostUnregisterOp(filter_cast) - gpu.HostUnregisterOp(bias_cast) - gpu.HostUnregisterOp(output_cast) - - return output_val - - -# TODO: Consider the cases where the maxpool2d operation needs padding. -def maxpool2d_op(node: MaxPool2dOp, symbol_table): - """ - Import the maxpool2d operation. - From Buddy MaxPool2dOp to MLIR GPU `max_pool2d` kernel. - """ - if len(node.args) == 5: - raise NotImplementedError - input1 = node.args[0] - kernel = node.args[1] - stride = node.args[2] - - # Prepare padding data - if len(node.args) > 3: - pad = node.args[3] - else: - pad = [0 for _ in kernel] - - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - output_shape = node.tensor_meta["shape"] - - batch_size = output_shape[0] - in_channels = output_shape[1] - out_size_h = output_shape[2] - out_size_w = output_shape[3] - - input_val = symbol_table.get((str(input1), 0)) - output_val = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - input_cast = memref.CastOp(unranked_memref_type, input_val) - output_cast = memref.CastOp(unranked_memref_type, output_val) - - gpu.HostRegisterOp(input_cast) - gpu.HostRegisterOp(output_cast) - - # Tile the input_val into Grids - block_z = ((out_size_h + TILE_WIDTH - 1) // TILE_WIDTH) * ((out_size_w + TILE_WIDTH - 1) // TILE_WIDTH) - batch_size_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), batch_size)) - in_channels_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), in_channels)) - block_z_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), block_z)) - tile_width_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), TILE_WIDTH)) - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - - # threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1) numBlocks(N, K, block_z) - - gpu_kernel = gpu.LaunchOp( - asyncToken=None, - asyncDependencies=[], - gridSizeX=batch_size_val.result, - gridSizeY=in_channels_val.result, - gridSizeZ=block_z_val.result, - blockSizeX=tile_width_val.result, - blockSizeY=tile_width_val.result, - blockSizeZ=c1.result, - ) - - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), # block_id x - ir.IndexType.get(), # block_id y - ir.IndexType.get(), # block_id z - ir.IndexType.get(), # thread_id x - ir.IndexType.get(), # thread_id y - ir.IndexType.get(), # thread_id z - ir.IndexType.get(), # grid_size x - ir.IndexType.get(), # grid_size y - ir.IndexType.get(), # grid_size z - ir.IndexType.get(), # block_size x - ir.IndexType.get(), # block_size y - ir.IndexType.get(), # block_size z - ] - ) - - with ir.InsertionPoint(gpu_kernel_block): - batch_id = gpu_kernel_block.arguments[0] - in_channel_id = gpu_kernel_block.arguments[1] - tile_id = gpu_kernel_block.arguments[2] - thread_local_idx = 
gpu_kernel_block.arguments[3] - thread_local_idy = gpu_kernel_block.arguments[4] - - # Calculate the convolution element at (h, w) for this thread - tile_num = (out_size_w + TILE_WIDTH - 1) // TILE_WIDTH - tile_num_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), tile_num)) - - t0 = arith.divui(tile_id, tile_num_val) - t1 = arith.muli(t0, tile_width_val) - thread_global_idx = arith.addi(t1, thread_local_idx) - - t2 = arith.remui(tile_id, tile_num_val) - t3 = arith.muli(t2, tile_width_val) - thread_global_idy = arith.addi(t3, thread_local_idy) - - kernel_size_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[0])) - kernel_size_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), kernel[1])) - stride_h = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[0])) - stride_w = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), stride[1])) - init_ele_idx = arith.muli(thread_global_idx, stride_h) - init_ele_idy = arith.muli(thread_global_idy, stride_w) - - # Check if the (h, w) is out of the output bounds - ult = 6 - out_size_h_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_h)) - out_size_w_val = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), out_size_w)) - isHInBounds = arith.cmpi(ult, thread_global_idx, out_size_h_val) - isWInBounds = arith.cmpi(ult, thread_global_idy, out_size_w_val) - isInBounds = arith.andi(isHInBounds, isWInBounds) - - branch0 = scf.IfOp(isInBounds) - with ir.InsertionPoint(branch0.then_block): - first_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, init_ele_idx, init_ele_idy]) - loop0 = scf.ForOp( - lower_bound=c0.result, - upper_bound=kernel_size_h.result, - step=c1.result, - iter_args=[first_ele.result] - ) - with ir.InsertionPoint(loop0.body): - loop1 = scf.ForOp( - lower_bound=c0.result, - upper_bound=kernel_size_w.result, - step=c1.result, - iter_args=[first_ele.result] - ) - with ir.InsertionPoint(loop1.body): - # TODO : loop body - kernel_ele_idx = loop0.body.arguments[0] - kernel_ele_idy = loop1.body.arguments[0] - input_ele_idx = arith.addi(init_ele_idx, kernel_ele_idx) - input_ele_idy = arith.addi(init_ele_idy, kernel_ele_idy) - input_ele = memref.LoadOp(input_val, [batch_id, in_channel_id, input_ele_idx, input_ele_idy]) - iter_arg1 = loop1.body.arguments[1] - iter_res1 = arith.maxnumf(iter_arg1, input_ele) - scf.YieldOp([iter_res1]) - - iter_arg0 = loop0.body.arguments[1] - iter_res0 = arith.maxnumf(loop1, iter_arg0) - scf.YieldOp([iter_res0]) - - memref.StoreOp(loop0, output_val, [batch_id, in_channel_id, thread_global_idx, thread_global_idy]) - scf.YieldOp([]) - - gpu.TerminatorOp() - - gpu.HostUnregisterOp(input_cast) - gpu.HostUnregisterOp(output_cast) - - return output_val - - -def addmm_op( - node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation] -): - dtype = node.tensor_meta["dtype"] - element_type = mlir_element_type_get(dtype) - c0 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 0)) - c1 = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 1)) - kernels = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), 512)) - - # TODO: Reverse the order of the mat2 before multiplication to optimize the cache hit rate - - input_data = symbol_table.get((str(node.args[1]), 0), node.args[1]) - weight = symbol_table.get((str(node.args[2]), 0), 
node.args[2]) - bias = symbol_table.get((str(node.args[0]), 0), node.args[0]) - # print("input_data: "+str(input_data)) - # print("weight: "+str(weight)) - # print("bias: "+str(bias)) - - # TODO: Transpose of the mat2 before multiplication to optimize the cache hit rate - - output_shape = list(node.tensor_meta["shape"]) - input_shape = input_data.type.shape - weight_shape = weight.type.shape - # print("output_shape: "+str(output_shape)) - # print("output_shape: "+str()) - # print("input_shape: "+str(input_shape)) - # print("weight_shape: "+str(weight_shape)) - # print("bias shape: "+str(bias.type.shape)) - - # Flatten the input into a one-dimensional format - input_size = tensor_shape_size(input_shape) - weight_size = tensor_shape_size(weight_shape) - output_size = tensor_shape_size(output_shape) - # print("input_size: "+str(input_size)) - # print("weight_size: "+str(weight_size)) - # print("output_size: "+str(output_size)) - - input_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_size)) - weight_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), weight_size)) - output_size_c = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_size)) - # print("input_size_c: "+str(input_size_c)) - # print("weight_size_c: "+str(weight_size_c)) - # print("output_size_c: "+str(output_size_c)) - - input_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - weight_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - bias_shape_1d = memref.AllocOp(ir.MemRefType.get([1], ir.IndexType.get()), [], []) - # print("input_shape_1d: "+str(input_shape_1d)) - # print("weight_shape_1d: "+str(weight_shape_1d)) - # print("bias_shape_1d: "+str(bias_shape_1d)) - - memref.StoreOp(input_size_c, input_shape_1d, [c0]) - memref.StoreOp(weight_size_c, weight_shape_1d, [c0]) - memref.StoreOp(output_size_c, bias_shape_1d, [c0]) - - input_reshape_type = ir.MemRefType.get([input_size], element_type) - weight_reshape_type = ir.MemRefType.get([weight_size], element_type) - bias_reshape_type = ir.MemRefType.get([output_size], element_type) - output_type = ir.MemRefType.get(output_shape, element_type) - # print("input_reshape_type: "+str(input_reshape_type)) - # print("weight_reshape_type: "+str(weight_reshape_type)) - # print("bias_reshape_type: "+str(bias_reshape_type)) - # print("output_type: "+str(output_type)) - - input_reshape_1d = memref.ReshapeOp(input_reshape_type, input_data, input_shape_1d) - weight_reshape_1d = memref.ReshapeOp(weight_reshape_type, weight, weight_shape_1d) - bias_reshape_1d = memref.ReshapeOp(bias_reshape_type, bias, bias_shape_1d) - # print("input_reshape: "+str(input_reshape_1d)) - # print("weight_reshape: "+str(weight_reshape_1d)) - # print("bias_reshape: "+str(bias_reshape_1d)) - - - unranked_memref_type = ir.UnrankedMemRefType.get(element_type, ir.IntegerAttr.get(ir.IndexType.get(), 0)) - gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, input_reshape_1d)) - gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, weight_reshape_1d)) - gpu.HostRegisterOp(memref.CastOp(unranked_memref_type, bias_reshape_1d)) - - row = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[0])) - col = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), weight_shape[1])) - inner_dim = arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), input_shape[1])) - - gpu_kernel = gpu.LaunchOp( 
- asyncToken=None, - asyncDependencies=[], - gridSizeX=c1.result, gridSizeY=c1.result, gridSizeZ=c1.result, - blockSizeX=kernels.result, blockSizeY=c1.result, blockSizeZ=c1.result, - ) - gpu_kernel_block = ir.Block.create_at_start( - gpu_kernel.body, - [ - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_idx, block_idy, block_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # thread_idx , thread_idy, thread_idz - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # grid_size x, grid_size y, grid_size z - ir.IndexType.get(), ir.IndexType.get(), ir.IndexType.get(), # block_size x, block_size y, block_size z - ] - ) - - # TODO: optimize to one dimension - with ir.InsertionPoint(gpu_kernel_block): - tIdX = gpu_kernel_block.arguments[3] - tIdY = gpu_kernel_block.arguments[4] - otter_loop = scf.ForOp( - lower_bound=tIdX, - upper_bound=row, - step=gpu_kernel.blockSizeX - ) - with ir.InsertionPoint(otter_loop.body): - inner_loop = scf.ForOp( - lower_bound=tIdY, - upper_bound=col, - step=gpu_kernel.blockSizeY - ) - with ir.InsertionPoint(inner_loop.body): - initial_sum = arith.ConstantOp(ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0.0)) - - mul_loop = scf.ForOp( - lower_bound=c0.result, - upper_bound=inner_dim, - step=c1.result, - iter_args=[initial_sum] - ) - with ir.InsertionPoint(mul_loop.body): - sum = mul_loop.inner_iter_args[0] - mat1_load = memref.LoadOp(input_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, inner_dim).result, mul_loop.induction_variable)]) - mat2_load = memref.LoadOp(weight_reshape_1d, [arith.AddIOp(arith.MulIOp(mul_loop.induction_variable, col).result, inner_loop.induction_variable)]) - res = arith.MulFOp(mat1_load, mat2_load) - res = arith.AddFOp(sum, res) - scf.YieldOp([res]) - - sum = mul_loop.result - bias_load = memref.LoadOp(bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) - res = arith.AddFOp(sum, bias_load) - memref.StoreOp(res, bias_reshape_1d, [arith.AddIOp(arith.MulIOp(otter_loop.induction_variable, col).result, inner_loop.induction_variable)]) - scf.YieldOp([]) - scf.YieldOp([]) - gpu.TerminatorOp() - - - output = memref.AllocOp(ir.MemRefType.get(output_shape, element_type), [], []) - - # FIXME: Dialect `memref' not found for custom op 'memref.expand_shape' - # axis = ir.ArrayAttr.get( - # [ - # ir.IntegerAttr.get(ir.IntegerType.get_signless(64), i) - # for i in range(len(output_shape)) - # ], - # None, - # ) - # axis = ir.ArrayAttr.get([axis], None) - # bias_reshape = memref.ExpandShapeOp(output_type, bias, axis) - - bias_shape = memref.AllocOp(ir.MemRefType.get([len(output_shape)], ir.IndexType.get()), [], []) - # print("bias_shape: "+str(bias_shape)) - for i in range(len(output_shape)): - memref.StoreOp(arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), output_shape[i])), bias_shape, [arith.ConstantOp(ir.IndexType.get(), ir.IntegerAttr.get(ir.IndexType.get(), i))]) - - bias_reshape = memref.ReshapeOp(output_type, bias, bias_shape) - memref.CopyOp(bias_reshape, output) - return output - - -ops_registry = { - "ReluOp": relu_op, - "ViewOp": reshape_op, - "PermuteOp": permute_op, - "Conv2dOp": convolution2d_op, - "MaxPool2dOp": maxpool2d_op, - "AddMMOp": addmm_op -} diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp index dd50feccf8..f616127930 100644 --- a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp +++ 
b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp @@ -18,11 +18,9 @@ // //===---------------------------------------------------------------------===// -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/TypeRange.h" @@ -30,9 +28,7 @@ #include "mlir/IR/Visitors.h" #include "mlir/Support/LLVM.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include #include #include @@ -42,11 +38,8 @@ #include #include -#include -#include -#include -#include -#include +#include + using namespace mlir; using namespace vector; @@ -82,6 +75,9 @@ class ConvertMemcpyToGPUPass void ConvertMemcpyToGPUPass::runOnOperation() { auto funcOp = getOperation(); + if (funcOp.isDeclaration() || funcOp.isExternal()) + return; + // Make sure the gpu function is already outlined. funcOp->walk([&](Operation *nestedOp) { if (auto gpuLaunchOp = dyn_cast(nestedOp)) { @@ -90,8 +86,9 @@ void ConvertMemcpyToGPUPass::runOnOperation() { return WalkResult::advance(); }); - std::set unDeallocatedOperations; + std::vector unDeallocatedValue; OpBuilder builder(funcOp->getContext()); + // Copy all function arguments to gpu, needs deallocation if (processArgs) { builder.setInsertionPointToStart(&(funcOp.getBody().front())); @@ -103,23 +100,11 @@ void ConvertMemcpyToGPUPass::runOnOperation() { auto memrefType = dyn_cast(arg.getType()); auto gpuAllocOp = builder.create( builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedOperations.insert(&gpuAllocOp); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); auto gpuMemcpyOp = builder.create( gpuAllocOp.getLoc(), TypeRange(), ValueRange(), gpuAllocOp.getResult(0), arg); - // Replace all users with GPU memory - auto users = arg.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - // Don't replace memcpy's operand - if (isa(user)) - continue; - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == arg) { - user->setOperand(j, gpuAllocOp.getResult(0)); - } - } - } + arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp); } } @@ -149,19 +134,18 @@ void ConvertMemcpyToGPUPass::runOnOperation() { auto gpuAllocOp = builder.create( allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - auto users = result.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - for (size_t j = 0; j < user->getNumOperands(); j++) { - // Only the return value will not have dealloc op - if (auto deallocOp = dyn_cast(user)) { - builder.setInsertionPointAfter(deallocOp); - auto gpuDeallocOp = builder.create( - deallocOp->getLoc(), TypeRange(), ValueRange(), - gpuAllocOp.getResult(0)); - deallocOp->erase(); - } else if (user->getOperand(j) == result) { - user->setOperand(j, gpuAllocOp.getResult(0)); + + for (auto user : llvm::make_early_inc_range(result.getUsers())) { + if (auto deallocOp = dyn_cast(user)) { + builder.setInsertionPointAfter(deallocOp); + builder.create(deallocOp->getLoc(), TypeRange(), + ValueRange(), gpuAllocOp.getResult(0)); + deallocOp->erase(); + } else { + for (auto &opOperand : user->getOpOperands()) { + if (opOperand.is(result)) { + opOperand.set(gpuAllocOp.getResult(0)); + } } } } @@ -175,28 +159,8 @@ void 
ConvertMemcpyToGPUPass::runOnOperation() { builder.setInsertionPointAfter(copyOp); auto gpuMemcpyOp = builder.create( copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); - { - auto users = src.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == src) { - user->setOperand(j, gpuMemcpyOp.getOperand(1)); - } - } - } - } - { - auto users = dst.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == src) { - user->setOperand(j, gpuMemcpyOp.getOperand(0)); - } - } - } - } + src.replaceAllUsesWith(gpuMemcpyOp->getResult(1)); + dst.replaceAllUsesWith(gpuMemcpyOp->getResult(0)); copyOp->erase(); } // Allocate space on GPU and copy global memrefs to GPU, needs deallocation @@ -206,47 +170,34 @@ void ConvertMemcpyToGPUPass::runOnOperation() { auto memrefType = dyn_cast(result.getType()); auto gpuAllocOp = builder.create( getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedOperations.insert(&gpuAllocOp); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); + auto src = result; auto dst = gpuAllocOp->getResult(0); auto gpuMemcpyOp = builder.create( gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); - { - auto users = src.getUsers(); - std::vector usersVec(users.begin(), users.end()); - for (auto user : usersVec) { - if (isa(user)) - continue; - // TODO: replace with src.replaceAllUsesExcept() - for (size_t j = 0; j < user->getNumOperands(); j++) { - if (user->getOperand(j) == src) { - user->setOperand(j, dst); - } - } - } - } + src.replaceAllUsesExcept(dst, gpuMemcpyOp); } // Copy data back to CPU, deallocate GPU, then return else if (auto returnOp = dyn_cast(nestedOp)) { builder.setInsertionPoint(returnOp); - - for (auto *gpuAllocOp : unDeallocatedOperations) { - auto gpuDeallocOp = builder.create( - builder.getUnknownLoc(), TypeRange(), ValueRange(), - gpuAllocOp->getResult(0)); - } - builder.setInsertionPoint(returnOp); for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { auto val = returnOp->getOperand(i); - auto memRefType = dyn_cast(val.getType()); - auto allocOp = builder.create(builder.getUnknownLoc(), - memRefType); - auto gpuMemcpyOp = builder.create( - allocOp.getLoc(), TypeRange(), ValueRange(), allocOp->getResult(0), - val); - auto gpuDeallocOp = builder.create( - gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); - returnOp->setOperand(i, allocOp->getResult(0)); + if (auto memrefType = dyn_cast(val.getType())) { + auto allocOp = + builder.create(returnOp->getLoc(), memrefType); + builder.create(allocOp.getLoc(), TypeRange(), + ValueRange(), allocOp->getResult(0), + val); + // FIXME: may be leak memory + // auto gpuDeallocOp = builder.create( + // gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); + returnOp->setOperand(i, allocOp->getResult(0)); + } + } + for (auto value : unDeallocatedValue) { + builder.create(returnOp->getLoc(), TypeRange(), + ValueRange(), value); } } return WalkResult::advance(); @@ -260,4 +211,4 @@ void registerConvertMemcpyToGPUPass() { PassRegistration(); } } // namespace buddy -} // namespace mlir +} // namespace mlir \ No newline at end of file diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir index 63edfd8d02..f616127930 100644 --- a/tests/Conversion/convert-memcpy-to-gpu.mlir +++ 
b/tests/Conversion/convert-memcpy-to-gpu.mlir @@ -1,23 +1,214 @@ -// RUN: buddy-opt -convert-memcpy-to-gpu -canonicalize %s | FileCheck %s - -// CHECK: %memref = gpu.alloc () : memref<32x32xf32> -// CHECK: %memref_0 = gpu.alloc () : memref<32x32xf32> -// CHECK: gpu.dealloc %memref : memref<32x32xf32> -// CHECK: %alloc = memref.alloc() : memref<32x32xf32> -// CHECK: gpu.memcpy %alloc, %memref_0 : memref<32x32xf32>, memref<32x32xf32> -// CHECK: gpu.dealloc %memref_0 : memref<32x32xf32> -module attributes {gpu.container_module} { - func.func @matmul(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>) -> memref<32x32xf32> { - %c2 = arith.constant 2 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() {alignment = 64 : i64} : memref<32x32xf32> - gpu.launch_func @matmul_kernel::@matmul_kernel blocks in (%c1, %c1, %c1) threads in (%c64, %c2, %c1) - return %alloc : memref<32x32xf32> +//===- ConvertMemcpyToGPU.cpp ---------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass that converts memcpy to gpu operations. +// +//===---------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace mlir; +using namespace vector; + +//===----------------------------------------------------------------------===// +// ConvertMemcpyToGPUPass +//===----------------------------------------------------------------------===// + +namespace { + +class ConvertMemcpyToGPUPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertMemcpyToGPUPass) + StringRef getArgument() const final { return "convert-memcpy-to-gpu"; } + StringRef getDescription() const final { + return "Convert memref opertaions to gpu operations."; + } + ConvertMemcpyToGPUPass() = default; + ConvertMemcpyToGPUPass(const ConvertMemcpyToGPUPass &) {} + + Option processArgs{ + *this, "process-args", + llvm::cl::desc("Whether the pass processes the input args."), + llvm::cl::init(true)}; + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); } - gpu.module @matmul_kernel { - gpu.func @matmul_kernel() kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { - gpu.return +}; + +void ConvertMemcpyToGPUPass::runOnOperation() { + auto funcOp = getOperation(); + + if (funcOp.isDeclaration() || 
funcOp.isExternal()) + return; + + // Make sure the gpu function is already outlined. + funcOp->walk([&](Operation *nestedOp) { + if (auto gpuLaunchOp = dyn_cast(nestedOp)) { + nestedOp->emitOpError("The gpu function should be outlined."); + } + return WalkResult::advance(); + }); + + std::vector unDeallocatedValue; + OpBuilder builder(funcOp->getContext()); + + // Copy all function arguments to gpu, needs deallocation + if (processArgs) { + builder.setInsertionPointToStart(&(funcOp.getBody().front())); + unsigned numArgs = funcOp.getNumArguments(); + for (unsigned i = 0; i < numArgs; ++i) { + BlockArgument arg = funcOp.getArgument(i); + // Create a gpu.alloc op, then copy memory to it + // TODO: Move this out of operation, make the copy process async + auto memrefType = dyn_cast(arg.getType()); + auto gpuAllocOp = builder.create( + builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); + auto gpuMemcpyOp = builder.create( + gpuAllocOp.getLoc(), TypeRange(), ValueRange(), + gpuAllocOp.getResult(0), arg); + arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp); } } + + funcOp->walk([&](Operation *nestedOp) { + // Replace all allocations with GPU.alloc + if (auto allocOp = dyn_cast(nestedOp)) { + // Rewrite this allocOp to gpu.alloc, change for all users + builder.setInsertionPointAfter(allocOp); + auto result = allocOp->getResult(0); + auto memrefType = dyn_cast(result.getType()); + auto memorySpace = memrefType.getMemorySpace(); + + // Filter operations. + if (memorySpace) { + if (auto intMemorySpace = llvm::dyn_cast(memorySpace)) { + if (intMemorySpace.getInt() != 0) { + return WalkResult::advance(); + } + } else if (auto gpuMemorySpace = + llvm::dyn_cast(memorySpace)) { + if (gpuMemorySpace.getValue() != gpu::AddressSpace::Global) { + return WalkResult::advance(); + } + } else + return WalkResult::advance(); + } + + auto gpuAllocOp = builder.create( + allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); + + for (auto user : llvm::make_early_inc_range(result.getUsers())) { + if (auto deallocOp = dyn_cast(user)) { + builder.setInsertionPointAfter(deallocOp); + builder.create(deallocOp->getLoc(), TypeRange(), + ValueRange(), gpuAllocOp.getResult(0)); + deallocOp->erase(); + } else { + for (auto &opOperand : user->getOpOperands()) { + if (opOperand.is(result)) { + opOperand.set(gpuAllocOp.getResult(0)); + } + } + } + } + allocOp->erase(); + } + // Replace all memory.copy operations with gpu.memcpy + else if (auto copyOp = dyn_cast(nestedOp)) { + auto src = copyOp.getOperand(0); + auto dst = copyOp.getOperand(1); + // Notice: GPU.memcpy has a different src dst order + builder.setInsertionPointAfter(copyOp); + auto gpuMemcpyOp = builder.create( + copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); + src.replaceAllUsesWith(gpuMemcpyOp->getResult(1)); + dst.replaceAllUsesWith(gpuMemcpyOp->getResult(0)); + copyOp->erase(); + } + // Allocate space on GPU and copy global memrefs to GPU, needs deallocation + else if (auto getGlobalOp = dyn_cast(nestedOp)) { + builder.setInsertionPointAfter(getGlobalOp); + auto result = getGlobalOp->getResult(0); + auto memrefType = dyn_cast(result.getType()); + auto gpuAllocOp = builder.create( + getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); + unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); + + auto src = result; + auto dst = gpuAllocOp->getResult(0); + auto gpuMemcpyOp = builder.create( + gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); 
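+      // replaceAllUsesExcept rewires every use of the global memref to the
+      // GPU buffer while leaving the gpu.memcpy itself reading from the
+      // original host-side value, so the copy still has a valid source.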
+ src.replaceAllUsesExcept(dst, gpuMemcpyOp); + } + // Copy data back to CPU, deallocate GPU, then return + else if (auto returnOp = dyn_cast(nestedOp)) { + builder.setInsertionPoint(returnOp); + for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { + auto val = returnOp->getOperand(i); + if (auto memrefType = dyn_cast(val.getType())) { + auto allocOp = + builder.create(returnOp->getLoc(), memrefType); + builder.create(allocOp.getLoc(), TypeRange(), + ValueRange(), allocOp->getResult(0), + val); + // FIXME: may be leak memory + // auto gpuDeallocOp = builder.create( + // gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); + returnOp->setOperand(i, allocOp->getResult(0)); + } + } + for (auto value : unDeallocatedValue) { + builder.create(returnOp->getLoc(), TypeRange(), + ValueRange(), value); + } + } + return WalkResult::advance(); + }); +} +} // end anonymous namespace. + +namespace mlir { +namespace buddy { +void registerConvertMemcpyToGPUPass() { + PassRegistration(); } +} // namespace buddy +} // namespace mlir \ No newline at end of file From 53d69d61797c402b52474765a90f62d98bd04bd1 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 29 Oct 2024 06:42:10 +0000 Subject: [PATCH 24/29] CPU, GPU, Custom --- examples/BuddyLeNet/CMakeLists.txt | 78 ++++++---- examples/BuddyLeNet/buddy-lenet-import.py | 10 +- frontend/Python/graph/graph.py | 142 +++++++----------- frontend/Python/graph/graph_driver.py | 49 +++--- frontend/Python/graph/operation.py | 2 +- frontend/Python/graph/transform/__init__.py | 2 +- frontend/Python/graph/transform/fuse_ops.py | 52 ++++++- .../graph/transform/useless_op_eliminate.py | 2 +- frontend/Python/ops/tosa.py | 1 - frontend/Python/ops/utils.py | 13 -- 10 files changed, 191 insertions(+), 160 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 6e9cfe1204..5935ad50c5 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -49,8 +49,54 @@ add_custom_command( DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir COMMENT "Building subgraph0.o" VERBATIM) - -# new + +set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") +set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") +# add_custom_command( +# OUTPUT subgraph0.o +# COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt +# -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} +# -buffer-deallocation +# -convert-linalg-to-parallel-loops +# -canonicalize +# -gpu-map-parallel-loops +# -convert-parallel-loops-to-gpu +# -gpu-kernel-outlining +# -canonicalize +# -cse | +# ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | +# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | +# ${LLVM_TOOLS_BINARY_DIR}/llvm-as | +# ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o +# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir +# COMMENT "Building subgraph0.o" +# VERBATIM) + +add_custom_command( + OUTPUT subgraph1.o + COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt 
${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt + -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} + -buffer-deallocation + -convert-linalg-to-parallel-loops + -canonicalize + -gpu-map-parallel-loops + -convert-parallel-loops-to-gpu + -gpu-kernel-outlining + -canonicalize + -cse | + ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | + ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | + ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_TOOLS_BINARY_DIR}/llvm-as | + ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o + DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir + COMMENT "Building subgraph1.o" + VERBATIM) set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") add_custom_command( @@ -75,34 +121,6 @@ add_custom_command( DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir COMMENT "Building subgraph1.o" VERBATIM) - -# add_library(LENET_GPU STATIC subgraph0_gpu.o forward.o) - -# SET_TARGET_PROPERTIES(LENET_GPU PROPERTIES LINKER_LANGUAGE C) - -# add_executable(buddy-lenet-run-gpu buddy-lenet-main.cpp) -# target_link_directories(buddy-lenet-run-gpu PRIVATE ${LLVM_LIBRARY_DIR}) - -# set(BUDDY_LENET_LIBS_GPU LENET_GPU mlir_c_runner_utils mlir_async_runtime mlir_runner_utils mlir_cuda_runtime ${PNG_LIBRARIES}) - -# target_link_libraries(buddy-lenet-run-gpu ${BUDDY_LENET_LIBS_GPU}) - -# add_custom_command( -# OUTPUT subgraph1.ll -# COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -gpu-kernel-outlining -llvm-request-c-wrappers -convert-vector-to-scf -convert-vector-to-llvm -func-bufferize-dynamic-offset -tensor-bufferize -buffer-deallocation -finalizing-bufferize -expand-strided-metadata -one-shot-bufferize | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# -pass-pipeline "builtin.module(nvvm-attach-target{chip=sm_75 O=3}, gpu.module(convert-scf-to-cf, convert-gpu-to-nvvm, convert-arith-to-llvm), convert-scf-to-cf, gpu-to-llvm, reconcile-unrealized-casts, gpu-module-to-binary)" | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir -# COMMENT "Building subgraph1.ll" -# VERBATIM) - -# add_custom_command( -# OUTPUT subgraph1.o -# COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang++ ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -L/usr/local/cuda/lib64 -lcudart -O3 -c -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.o -# DEPENDS ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph1.ll -# COMMENT "Building subgraph1.o" -# VERBATIM) add_library(LENET STATIC subgraph0.o subgraph1.o forward.o) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index c878b3b163..4acd548038 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -27,7 +27,7 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse +from buddy.compiler.graph.transform import cpu_fuse, gpu_fuse, 
custom_partition from buddy.compiler.graph.type import DeviceType from buddy.compiler.ops import tosa, gpu from buddy.compiler.graph.json_decoder import json_to_graph @@ -58,7 +58,7 @@ assert len(graphs) == 1 graph = graphs[0] params = dynamo_compiler.imported_params[graph] -pattern_list = [simply_fuse] +pattern_list = [custom_partition] graph.fuse_ops(pattern_list) path_prefix = os.path.dirname(os.path.abspath(__file__)) @@ -71,10 +71,10 @@ graph0 = json_to_graph(json_str) driver = GraphDriver(graph0) driver.subgraphs[0].lower_to_top_level_ir() -driver.subgraphs[1].lower_to_top_level_ir() - with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) +# Add heterogeneous hardware partition +driver.subgraphs[1].lower_to_top_level_ir() with open(os.path.join(path_prefix, "subgraph1.mlir"), "w") as module_file: print(driver.subgraphs[1]._imported_module, file=module_file) with open(os.path.join(path_prefix, "forward.mlir"), "w") as module_file: @@ -103,4 +103,4 @@ # # Convert the lenet graph to DOT string # dot_str = graph.to_dot() # with open(os.path.join(path_prefix, "graph.dot"), "w") as module_file: -# module_file.write(dot_str) \ No newline at end of file +# module_file.write(dot_str) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index c7239a0d7d..5ddbbe8328 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -109,7 +109,7 @@ def __init__( ops_registry: dict, func_name: str, device: DeviceType = DeviceType.CPU, - verbose=False + verbose=False, ) -> None: """ Initializes the Graph. @@ -175,26 +175,14 @@ def init_op_group(self): Returns: - None """ - # for i, op in enumerate(self._body): - # if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp): - # continue - # group = [op] - # subgraph_name = "subgraph{}".format(i) - # self.group_map_device[subgraph_name] = DeviceType.CPU - # self.op_groups[subgraph_name] = group group = [] for i, op in enumerate(self._body): - if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i == 25: + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp): continue - group.append(op) - subgraph_name = "subgraph1" - self.group_map_device[subgraph_name] = DeviceType.CPU - self.op_groups[subgraph_name] = group - - new_group = [self._body[25]] - subgraph_name = "subgraph0" - self.group_map_device[subgraph_name] = DeviceType.GPU - self.op_groups[subgraph_name] = new_group + group = [op] + subgraph_name = "subgraph{}".format(i) + self.group_map_device[subgraph_name] = DeviceType.CPU + self.op_groups[subgraph_name] = group def fuse_ops(self, pattern_list: List[FunctionType]): """ @@ -214,9 +202,9 @@ def fuse_ops(self, pattern_list: List[FunctionType]): # Initialize operation groups self.init_op_group() - # # Apply fusion patterns - # for pattern_func in pattern_list: - # pattern_func(self) + # Apply fusion patterns + for pattern_func in pattern_list: + pattern_func(self) def perform(self, func_list: List[FunctionType]): """ @@ -258,7 +246,7 @@ def lower_to_top_level_ir(self): self._ops_registry, False, self.device, - verbose=self._verbose + verbose=self._verbose, ) self._imported_module = fx_importer.import_graph() outputs = fx_importer.get_output_nodes() @@ -353,7 +341,7 @@ def to_dot(self): Returns: str: A DOT string representing the buddy graph for visualization. 
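
        Placeholder and output ops are drawn as white ellipses, max-pooling
        ops as red boxes, and all remaining ops as deepskyblue boxes.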
""" - dot = graphviz.Digraph(comment='Buddy Graph') + dot = graphviz.Digraph(comment="Buddy Graph") for op in self._body: # if isinstance(op, PlaceholderOp): # continue @@ -361,14 +349,23 @@ def to_dot(self): dot.edge(op._name, child) for op in self._body: if isinstance(op, PlaceholderOp): - dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + dot.node( + op._name, shape="ellipse", fillcolor="white", style="filled" + ) # continue elif isinstance(op, OutputOp): - dot.node(op._name, shape="ellipse", fillcolor="white", style="filled") + dot.node( + op._name, shape="ellipse", fillcolor="white", style="filled" + ) elif isinstance(op, MaxPool2dOp): dot.node(op._name, shape="box", fillcolor="red", style="filled") else: - dot.node(op._name, shape="box", fillcolor="deepskyblue", style="filled") + dot.node( + op._name, + shape="box", + fillcolor="deepskyblue", + style="filled", + ) return str(dot) def to_json(self): @@ -380,7 +377,7 @@ def to_json(self): """ json_str = json.dumps(self, cls=BuddyGraphEncoder) return json_str - + class BuddyGraphEncoder(json.JSONEncoder): """ @@ -392,36 +389,36 @@ class BuddyGraphEncoder(json.JSONEncoder): Returns: JSONEncoder: A JSON encoder instance for Buddy Graph objects. """ + def default(self, obj): if isinstance(obj, Graph): node_map_device = {} for subgraph_name, ops in obj.op_groups.items(): for op in ops: - node_map_device[op.name] = obj.group_map_device[subgraph_name] + node_map_device[op.name] = obj.group_map_device[ + subgraph_name + ] return { - 'graph_name' : obj._func_name, - 'nodes' : obj._body, - 'device' : obj.device, - 'params' : obj._fake_params, - 'inputs' : obj._inputs, - 'node_map_device' : node_map_device + "graph_name": obj._func_name, + "nodes": obj._body, + "device": obj.device, + "params": obj._fake_params, + "inputs": obj._inputs, + "node_map_device": node_map_device, } elif isinstance(obj, Op): return { - 'name' : obj._name, - 'children' : obj._children, - 'parents' : obj._parents, - 'arguments' : obj._arguments, - 'keyword_arguments' : obj._keyword_arguments, - 'tensor_meta' : obj._tensor_meta, - 'type' : obj._op_type, - 'class' : obj.__class__.__name__ + "name": obj._name, + "children": obj._children, + "parents": obj._parents, + "arguments": obj._arguments, + "keyword_arguments": obj._keyword_arguments, + "tensor_meta": obj._tensor_meta, + "type": obj._op_type, + "class": obj.__class__.__name__, } elif isinstance(obj, TensorMeta): - return { - 'shape' : obj.shape, - 'dtype' : obj.dtype - } + return {"shape": obj.shape, "dtype": obj.dtype} elif isinstance(obj, OpType): return obj._name_ elif isinstance(obj, TensorDType): @@ -431,6 +428,7 @@ def default(self, obj): else: return super().default(obj) + class GraphImporter: """ Imports an buddy graph and generates an MLIR module in high-level dialects. @@ -454,7 +452,7 @@ def __init__( ops_registry: dict, do_param_pack: bool = False, device: DeviceType = DeviceType.CPU, - verbose=False + verbose=False, ): """ Initializes the buddy Graph importer. 
@@ -572,40 +570,32 @@ def generated_func(*args): self._symbol_table.get((str(output_arg), 0)) for output_arg in output_node_args ] - # if self._device == DeviceType.GPU: - # returns = [ - # buffer.to_tensor(ret) - # for ret in returns - # ] self._symbol_table[("output", 0)] = returns elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) elif isinstance(node, GetItemOp): - self._symbol_table[ - (str(node.name), 0) - ] = self._symbol_table[ - (str(node.args[0]), node.args[1]) - ] + self._symbol_table[(str(node.name), 0)] = ( + self._symbol_table[ + (str(node.args[0]), node.args[1]) + ] + ) else: self._import_op(node) new_ops = [op for op in func_op.body.blocks[0].operations] if self._verbose: - print('='*20 + "Graph Node" + "="*20) + print("=" * 20 + "Graph Node" + "=" * 20) print("Node: " + node.name) print("Type: " + str(node._op_type)) print("Arguments: " + str(node.args)) print("Parents: " + str(node._parents)) print("Children: " + str(node._children)) - print('-'*20 + "MLIR OPS" + '-'*20) + print("-" * 20 + "MLIR OPS" + "-" * 20) for op in new_ops: if op not in old_ops: print(op) print("") - + return self._symbol_table.get(("output", 0)) - - # if self._device == DeviceType.GPU: - # self._module.operation.attributes["gpu.container_module"] = ir.UnitAttr.get() return self._module @@ -653,11 +643,11 @@ def generated_func(*args): elif isinstance(node, PlaceholderOp): self._import_placeholder(node, args_list) elif isinstance(node, GetItemOp): - self._symbol_table[ - (str(node.name), 0) - ] = self._symbol_table[ - (str(node.args[0]), node.args[1]) - ] + self._symbol_table[(str(node.name), 0)] = ( + self._symbol_table[ + (str(node.args[0]), node.args[1]) + ] + ) else: self._import_op(node) @@ -706,16 +696,6 @@ def _import_placeholder( else: placeholder_name = args_list[self._num_input_visited] - # TODO : Consider converting arg type from RankedTensorType to MemRefType - # if self._device == DeviceType.GPU: - # placeholder_name = buffer.to_memref( - # ir.MemRefType.get( - # list(node.tensor_meta.shape), - # self._str_to_mlir_dtype(node.tensor_meta.dtype) - # ), - # placeholder_name - # ) - self._symbol_table[(str(node.name), 0)] = placeholder_name self._num_input_visited += 1 @@ -727,16 +707,10 @@ def _import_op(self, node: Op): node (Op): The buddy node representing the operation. """ - op_name = node.__class__.__name__ - # if self._device == DeviceType.CPU: op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( self._ops_registry[op_name](node, self._symbol_table) ) - # else: - # op_ret: ir.Operation | ir.Value | tuple | List | ir.OpResult = ( - # self._ops_gpu_registry[op_name](node, self._symbol_table) - # ) if isinstance(op_ret, tuple | List): for i, operation in enumerate(op_ret): if isinstance(operation, ir.Operation) or isinstance( diff --git a/frontend/Python/graph/graph_driver.py b/frontend/Python/graph/graph_driver.py index 58e7766cb1..013a9f6e0b 100644 --- a/frontend/Python/graph/graph_driver.py +++ b/frontend/Python/graph/graph_driver.py @@ -41,6 +41,7 @@ class GraphDriver: - _subgraphs_outputs (dict): A dictionary mapping subgraph names to their output op's result. """ + def __init__(self, graph: Graph) -> None: """ Initialize the GraphDriver object with a given computational graph. 
@@ -53,9 +54,9 @@ def __init__(self, graph: Graph) -> None: - None """ self._graph = graph - self._subgraph_dependencies = { - subgraph_name : set() - for subgraph_name in list(self._graph.op_groups.keys()) + self._subgraph_dependencies = { + subgraph_name: set() + for subgraph_name in list(self._graph.op_groups.keys()) } self._call_table = {} ( @@ -100,7 +101,7 @@ def build_subgraph_by_group(self): if isinstance(node, OutputOp): for arg in node.args: output_node.append(arg) - + # Identify outputs for each subgraph and build dependencies between subgraphs for subgraph_name in self._graph.op_groups.keys(): subgraphs_outputs[subgraph_name] = [] @@ -135,11 +136,11 @@ def build_subgraph_by_group(self): if inp in node._parents: placeholder_node.add_children(op.name) subgraph_body.append(placeholder_node) - + # Add operations to subgraph body for op in self._graph.op_groups[subgraph_name]: subgraph_body.append(op) - + # Construct output node output_node = OutputOp() output_node.name = "output" @@ -151,11 +152,11 @@ def build_subgraph_by_group(self): # Create subgraph and add it to the dictionary subgraph = Graph( subgraph_input, - [], - self._graph._ops_registry, + [], + self._graph._ops_registry, subgraph_name, - subgraph_device, - verbose=self._graph._verbose + subgraph_device, + verbose=self._graph._verbose, ) subgraph.body = subgraph_body for op in subgraph_body: @@ -176,12 +177,14 @@ def topological_sort_subgraph(self): """ # Calculate in degree of each subgraph - in_degree = { subgraph_name : 0 for subgraph_name in list(self._subgraphs.keys()) } + in_degree = { + subgraph_name: 0 for subgraph_name in list(self._subgraphs.keys()) + } for src, dests in self._subgraph_dependencies.items(): for dest in dests: in_degree[dest] += 1 - # Topological sorting + # Topological sorting queue = deque([node for node in in_degree if in_degree[node] == 0]) topo_order = [] @@ -194,7 +197,11 @@ def topological_sort_subgraph(self): queue.append(child) # TODO: If the custom subgraph partitioning is illegal, further partition the subgraph to make it valid. - return topo_order if len(topo_order) == len(list(self._subgraphs.keys())) else None + return ( + topo_order + if len(topo_order) == len(list(self._subgraphs.keys())) + else None + ) def construct_main_graph(self, do_param_pack=False): """ @@ -217,7 +224,7 @@ def construct_main_graph(self, do_param_pack=False): self._graph._fake_params, self._graph._ops_registry, self._graph._func_name, - self._graph._verbose + self._graph._verbose, ) # Adding FuncOp nodes for each subgraph @@ -235,18 +242,18 @@ def construct_main_graph(self, do_param_pack=False): self._graph.node_table[output].tensor_meta["dtype"] ) main_graph.add_node(func_node) - + # Adding placeholder operations from the original graph for op in self._graph.body: if isinstance(op, PlaceholderOp): main_graph.add_node(op) - + # Analysis topology order to sort subgraph call. 
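        # topological_sort_subgraph() returns the subgraph call order as a
        # list of subgraph names; it returns None when the requested partition
        # contains a cyclic dependency and therefore cannot be scheduled.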
        topo_order = self.topological_sort_subgraph()
-        if topo_order == None: 
-            print('Error : Graph Partitioning is illegal!')
+        if topo_order == None:
+            print("Error: Graph partitioning is illegal!")
            return None
-
+
        # Adding CallOp to invoke the single subgraph
        for i, subgraph_name in enumerate(topo_order):
            call_node = CallOp()
@@ -261,7 +268,7 @@ def construct_main_graph(self, do_param_pack=False):
                    if inp in value:
                        call_node.add_argument(
                            arg=self._call_table[key].name,
-                            arg_index=value.index(inp)
+                            arg_index=value.index(inp),
                        )
                        break
            for output in self._subgraphs_outputs[subgraph_name]:
@@ -283,7 +290,7 @@ def construct_main_graph(self, do_param_pack=False):
            getitem_node.name = "getitem{}".format(i)
            output_node.add_argument(getitem_node.name)
            main_graph.add_node(getitem_node)
-
+
        # Marking the final output of the main graph
        output_node.name = "output"
        main_graph.add_node(output_node)
diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py
index fde8809fd6..0ec7930c25 100644
--- a/frontend/Python/graph/operation.py
+++ b/frontend/Python/graph/operation.py
@@ -126,7 +126,7 @@ def args(self):
    @property
    def kwargs(self):
        return self._keyword_arguments
-
+
    @property
    def parents(self):
        return self._parents
diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py
index d91e0d06b2..427d266b95 100644
--- a/frontend/Python/graph/transform/__init__.py
+++ b/frontend/Python/graph/transform/__init__.py
@@ -18,5 +18,5 @@
 #
 # ===---------------------------------------------------------------------------
 
-from .fuse_ops import simply_fuse
+from .fuse_ops import cpu_fuse, gpu_fuse, custom_partition
 from .useless_op_eliminate import maxpool2d_simplify
diff --git a/frontend/Python/graph/transform/fuse_ops.py b/frontend/Python/graph/transform/fuse_ops.py
index feb697930a..e0ff806f52 100644
--- a/frontend/Python/graph/transform/fuse_ops.py
+++ b/frontend/Python/graph/transform/fuse_ops.py
@@ -26,11 +26,33 @@
 # OP_TYPE_FUSABLE = [OpType.BroadcastType, OpType.ElementwiseType, OpType.ReshapeType]
 # OP_TYPE_UNFUSABLE = [OpType.Unfusable, OpType.ConcatType]
 # OP_TYPE_FUSABLE_BY_SPECIFIC_PASS = []
-# ANCHOR_OP_TYPE = [] 
 
-def simply_fuse(graph: Graph):
+
+
+def cpu_fuse(graph: Graph):
+    """
+    Function to fuse all operations into one graph. Set the device type to CPU.
+
+    Args:
+    - graph (Graph): The input graph to be simplified.
+
+    Returns:
+    - None: Modifies the input graph in place.
+    """
+    new_op_group = []
+    device = DeviceType.CPU
+    for op in graph.body:
+        if isinstance(op, PlaceholderOp):
+            continue
+        new_op_group.append(op)
+    graph.op_groups = {}
+    graph.op_groups["subgraph0"] = new_op_group
+    graph.group_map_device = {"subgraph0": device}
+
+
+def gpu_fuse(graph: Graph):
     """
-    Function to fuse all operations into one graph.
+    Function to fuse all operations into one graph. Set the device type to GPU.
 
     Args:
     - graph (Graph): The input graph to be simplified.
@@ -47,3 +69,27 @@ def simply_fuse(graph: Graph):
     graph.op_groups = {}
     graph.op_groups["subgraph0"] = new_op_group
     graph.group_map_device = {"subgraph0": device}
+
+
+def custom_partition(graph: Graph):
+    """
+    Function for custom subgraph partitioning.
+
+    Args:
+    - graph (Graph): The input graph to be simplified.
+
+    Returns:
+    - None: Modifies the input graph in place.
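+
+    Note: this partition hard-codes graph node index 25 as the single op that
+    is assigned to the GPU subgraph (subgraph0); every other compute op goes
+    into the CPU subgraph (subgraph1). The index appears to be tied to the
+    LeNet example graph used in this patch series, not a general-purpose rule.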
+ """ + group = [] + for i, op in enumerate(graph._body): + if isinstance(op, PlaceholderOp) or isinstance(op, OutputOp) or i == 25: + continue + group.append(op) + subgraph_name = "subgraph1" + graph.group_map_device[subgraph_name] = DeviceType.CPU + graph.op_groups[subgraph_name] = group + new_group = [graph._body[25]] + subgraph_name = "subgraph0" + graph.group_map_device[subgraph_name] = DeviceType.GPU + graph.op_groups[subgraph_name] = new_group diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py index 2522e17984..0d176be2df 100644 --- a/frontend/Python/graph/transform/useless_op_eliminate.py +++ b/frontend/Python/graph/transform/useless_op_eliminate.py @@ -74,4 +74,4 @@ def maxpool2d_simplify(graph: Graph): for j, op in enumerate(graph.body): if op == getitem_node: graph.body[j] = new_node - break \ No newline at end of file + break diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py index 3597810e4a..797fdfd6d2 100644 --- a/frontend/Python/ops/tosa.py +++ b/frontend/Python/ops/tosa.py @@ -1254,7 +1254,6 @@ def convolution2d_op(node: Conv2dOp, symbol_table): return op - def relu_op(node: ReluOp, symbol_table): """ Import the tensor relu operation. diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index 012340d475..dad07bd68c 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -53,16 +53,3 @@ def mlir_element_attr_get(type_name, value): return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value) case TensorDType.Bool: return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) - - -def tensor_shape_size(shape): - """ - Calculate the product of all dimensions in the given shape list, - which represents the size of the tensor. - Args: - shape: A list containing the sizes of each dimension of the tensor. 
- """ - size = 1 - for dim in shape: - size *= dim - return size From 2488faf07bfdb13bc2c0131caff8447146a6aee5 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Tue, 29 Oct 2024 07:26:34 +0000 Subject: [PATCH 25/29] temp --- .gitignore | 2 - examples/BuddyLeNet/buddy-lenet-import.py | 15 -- frontend/Python/graph/graph.py | 3 - frontend/Python/ops/utils.py | 1 + tests/Conversion/convert-memcpy-to-gpu.mlir | 275 +++++--------------- 5 files changed, 66 insertions(+), 230 deletions(-) diff --git a/.gitignore b/.gitignore index 8fcf8e4b1e..1ffba60cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,5 @@ # Clangd cache .cache -# environment bash -env.sh # Clangd configurations .clangd \ No newline at end of file diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index 4acd548038..aec4e5e561 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -89,18 +89,3 @@ float32_param.tofile(Path(current_path) / "arg0.data") -# # Convert the lenet graph to JSON string -# json_str = graph.to_json() -# with open(os.path.join(path_prefix, "lenet.json"), "w") as module_file: -# module_file.write(json_str) - -# # Convert the lenet graph Json string to a lenet graph -# graph0 = json_to_graph(json_str) -# graph0.lower_to_top_level_ir() -# with open(os.path.join(path_prefix, "lenet.mlir"), "w") as module_file: -# print(graph0._imported_module, file=module_file) - -# # Convert the lenet graph to DOT string -# dot_str = graph.to_dot() -# with open(os.path.join(path_prefix, "graph.dot"), "w") as module_file: -# module_file.write(dot_str) diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 5ddbbe8328..3283beacee 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -343,8 +343,6 @@ def to_dot(self): """ dot = graphviz.Digraph(comment="Buddy Graph") for op in self._body: - # if isinstance(op, PlaceholderOp): - # continue for child in op._children: dot.edge(op._name, child) for op in self._body: @@ -352,7 +350,6 @@ def to_dot(self): dot.node( op._name, shape="ellipse", fillcolor="white", style="filled" ) - # continue elif isinstance(op, OutputOp): dot.node( op._name, shape="ellipse", fillcolor="white", style="filled" diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py index dad07bd68c..337f5a6b49 100644 --- a/frontend/Python/ops/utils.py +++ b/frontend/Python/ops/utils.py @@ -53,3 +53,4 @@ def mlir_element_attr_get(type_name, value): return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value) case TensorDType.Bool: return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value) + diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir index f616127930..573000a4b5 100644 --- a/tests/Conversion/convert-memcpy-to-gpu.mlir +++ b/tests/Conversion/convert-memcpy-to-gpu.mlir @@ -1,214 +1,69 @@ -//===- ConvertMemcpyToGPU.cpp ---------------------------------------------===// -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -// -// This file implements the pass that converts memcpy to gpu operations. -// -//===---------------------------------------------------------------------===// - -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/OperationSupport.h" -#include "mlir/IR/TypeRange.h" -#include "mlir/IR/ValueRange.h" -#include "mlir/IR/Visitors.h" -#include "mlir/Support/LLVM.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace mlir; -using namespace vector; - -//===----------------------------------------------------------------------===// -// ConvertMemcpyToGPUPass -//===----------------------------------------------------------------------===// - -namespace { - -class ConvertMemcpyToGPUPass - : public PassWrapper> { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertMemcpyToGPUPass) - StringRef getArgument() const final { return "convert-memcpy-to-gpu"; } - StringRef getDescription() const final { - return "Convert memref opertaions to gpu operations."; +// RUN: buddy-opt -convert-memcpy-to-gpu="process-args=1" %s | FileCheck %s + +#map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)> +module attributes {gpu.container_module} { + memref.global "private" constant @__constant_1x10x10xf32 : memref<1x10x10xf32> = dense<1.000000e+00> {alignment = 64 : i64} + func.func @matmul(%arg0: memref<1x10x10xf32>, %arg1: memref<1x10x10xf32>) -> memref<1x10x10xf32> { + // CHECK: %[[d_arg0:.*]] = gpu.alloc () : memref<1x10x10xf32> + // CHECK-NEXT: gpu.memcpy %[[d_arg0]], %arg0 : memref<1x10x10xf32>, memref<1x10x10xf32> + // CHECK: %[[d_arg1:.*]] = gpu.alloc () : memref<1x10x10xf32> + // CHECK-NEXT: gpu.memcpy %[[d_arg1:.*]], %arg1 : memref<1x10x10xf32>, memref<1x10x10xf32> + %c10 = arith.constant 10 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f32 + // CHECK: %[[h_global_data:.*]] = memref.get_global @__constant_1x10x10xf32 : memref<1x10x10xf32> + // CHECK: %[[d_global_data:.*]] = gpu.alloc () : memref<1x10x10xf32> + // CHECK: gpu.memcpy %[[d_global_data]], %[[h_global_data]] : memref<1x10x10xf32>, memref<1x10x10xf32> + %0 = memref.get_global @__constant_1x10x10xf32 : memref<1x10x10xf32> + // CHECK: %[[d_alloc0:.*]] = gpu.alloc () : memref<1x10x10xf32> + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32> + // CHECK: gpu.launch_func + gpu.launch_func @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %cst : f32, %alloc : memref<1x10x10xf32>) + // CHECK: gpu.launch_func + // CHECK-SAME: %[[d_arg0]] + // CHECK-SAME: %[[d_arg1]] + // CHECK-SAME: %[[d_alloc0]] + gpu.launch_func @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<1x10x10xf32>, %arg1 : memref<1x10x10xf32>, %alloc : memref<1x10x10xf32>, %c10 : index) + // CHECK: %[[d_alloc1:.*]] = gpu.alloc () : memref<1x10x10xf32> + %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32> + // CHECK: gpu.launch_func + gpu.launch_func @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : 
index, %c0 : index, %cst : f32, %alloc_0 : memref<1x10x10xf32>) + // CHECK: gpu.launch_func + // CHECK-SAME: %[[d_global_data]] + // CHECK-SAME: %[[d_alloc0]] + // CHECK-SAME: %[[d_alloc1]] + gpu.launch_func @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %0 : memref<1x10x10xf32>, %alloc : memref<1x10x10xf32>, %alloc_0 : memref<1x10x10xf32>, %c10 : index) + // CHECK: %[[d_result:.*]] = gpu.alloc () : memref<1x10x10xf32> + %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10xf32> + // CHECK: gpu.launch_func + gpu.launch_func @kernel::@fill blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %cst : f32, %alloc_1 : memref<1x10x10xf32>) + // CHECK: gpu.launch_func + // CHECK-SAME: %[[d_alloc0]] + // CHECK-SAME: %[[d_alloc1]] + // CHECK-SAME: %[[d_result]] + gpu.launch_func @kernel::@matmul blocks in (%c10, %c10, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %alloc : memref<1x10x10xf32>, %alloc_0 : memref<1x10x10xf32>, %alloc_1 : memref<1x10x10xf32>, %c10 : index) + // CHECK: gpu.dealloc %[[d_alloc1]] : memref<1x10x10xf32> + memref.dealloc %alloc_0 : memref<1x10x10xf32> + // CHECK: gpu.dealloc %[[d_alloc0]] : memref<1x10x10xf32> + memref.dealloc %alloc : memref<1x10x10xf32> + + // CHECK: %[[h_alloc:.*]] = memref.alloc() : memref<1x10x10xf32> + // CHECK-NEXT: gpu.memcpy %[[h_alloc]], %[[d_result]] : memref<1x10x10xf32>, memref<1x10x10xf32> + + // CHECK: gpu.dealloc %[[d_arg0]] : memref<1x10x10xf32> + // CHECK: gpu.dealloc %[[d_arg1]] : memref<1x10x10xf32> + // CHECK: gpu.dealloc %[[d_global_data]] : memref<1x10x10xf32> + + // CHECK: return %[[h_alloc]] : memref<1x10x10xf32> + return %alloc_1 : memref<1x10x10xf32> } - ConvertMemcpyToGPUPass() = default; - ConvertMemcpyToGPUPass(const ConvertMemcpyToGPUPass &) {} - - Option processArgs{ - *this, "process-args", - llvm::cl::desc("Whether the pass processes the input args."), - llvm::cl::init(true)}; - - void runOnOperation() override; - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } -}; - -void ConvertMemcpyToGPUPass::runOnOperation() { - auto funcOp = getOperation(); - - if (funcOp.isDeclaration() || funcOp.isExternal()) - return; - - // Make sure the gpu function is already outlined. 
- funcOp->walk([&](Operation *nestedOp) { - if (auto gpuLaunchOp = dyn_cast(nestedOp)) { - nestedOp->emitOpError("The gpu function should be outlined."); + gpu.module @kernel { + gpu.func @fill(%arg0: index, %arg1: index, %arg2: f32, %arg3: memref<1x10x10xf32>) kernel attributes {gpu.known_block_size = array} { + gpu.return } - return WalkResult::advance(); - }); - - std::vector unDeallocatedValue; - OpBuilder builder(funcOp->getContext()); - - // Copy all function arguments to gpu, needs deallocation - if (processArgs) { - builder.setInsertionPointToStart(&(funcOp.getBody().front())); - unsigned numArgs = funcOp.getNumArguments(); - for (unsigned i = 0; i < numArgs; ++i) { - BlockArgument arg = funcOp.getArgument(i); - // Create a gpu.alloc op, then copy memory to it - // TODO: Move this out of operation, make the copy process async - auto memrefType = dyn_cast(arg.getType()); - auto gpuAllocOp = builder.create( - builder.getUnknownLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); - auto gpuMemcpyOp = builder.create( - gpuAllocOp.getLoc(), TypeRange(), ValueRange(), - gpuAllocOp.getResult(0), arg); - arg.replaceAllUsesExcept(gpuAllocOp->getResult(0), gpuMemcpyOp); + gpu.func @matmul(%arg0: index, %arg1: index, %arg2: memref<1x10x10xf32>, %arg3: memref<1x10x10xf32>, %arg4: memref<1x10x10xf32>, %arg5: index) kernel attributes {gpu.known_block_size = array} { + gpu.return } } - - funcOp->walk([&](Operation *nestedOp) { - // Replace all allocations with GPU.alloc - if (auto allocOp = dyn_cast(nestedOp)) { - // Rewrite this allocOp to gpu.alloc, change for all users - builder.setInsertionPointAfter(allocOp); - auto result = allocOp->getResult(0); - auto memrefType = dyn_cast(result.getType()); - auto memorySpace = memrefType.getMemorySpace(); - - // Filter operations. 
- if (memorySpace) { - if (auto intMemorySpace = llvm::dyn_cast(memorySpace)) { - if (intMemorySpace.getInt() != 0) { - return WalkResult::advance(); - } - } else if (auto gpuMemorySpace = - llvm::dyn_cast(memorySpace)) { - if (gpuMemorySpace.getValue() != gpu::AddressSpace::Global) { - return WalkResult::advance(); - } - } else - return WalkResult::advance(); - } - - auto gpuAllocOp = builder.create( - allocOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - - for (auto user : llvm::make_early_inc_range(result.getUsers())) { - if (auto deallocOp = dyn_cast(user)) { - builder.setInsertionPointAfter(deallocOp); - builder.create(deallocOp->getLoc(), TypeRange(), - ValueRange(), gpuAllocOp.getResult(0)); - deallocOp->erase(); - } else { - for (auto &opOperand : user->getOpOperands()) { - if (opOperand.is(result)) { - opOperand.set(gpuAllocOp.getResult(0)); - } - } - } - } - allocOp->erase(); - } - // Replace all memory.copy operations with gpu.memcpy - else if (auto copyOp = dyn_cast(nestedOp)) { - auto src = copyOp.getOperand(0); - auto dst = copyOp.getOperand(1); - // Notice: GPU.memcpy has a different src dst order - builder.setInsertionPointAfter(copyOp); - auto gpuMemcpyOp = builder.create( - copyOp->getLoc(), TypeRange(), ValueRange(), dst, src); - src.replaceAllUsesWith(gpuMemcpyOp->getResult(1)); - dst.replaceAllUsesWith(gpuMemcpyOp->getResult(0)); - copyOp->erase(); - } - // Allocate space on GPU and copy global memrefs to GPU, needs deallocation - else if (auto getGlobalOp = dyn_cast(nestedOp)) { - builder.setInsertionPointAfter(getGlobalOp); - auto result = getGlobalOp->getResult(0); - auto memrefType = dyn_cast(result.getType()); - auto gpuAllocOp = builder.create( - getGlobalOp->getLoc(), TypeRange({memrefType}), ValueRange({})); - unDeallocatedValue.push_back(gpuAllocOp->getResult(0)); - - auto src = result; - auto dst = gpuAllocOp->getResult(0); - auto gpuMemcpyOp = builder.create( - gpuAllocOp->getLoc(), TypeRange(), ValueRange(), dst, src); - src.replaceAllUsesExcept(dst, gpuMemcpyOp); - } - // Copy data back to CPU, deallocate GPU, then return - else if (auto returnOp = dyn_cast(nestedOp)) { - builder.setInsertionPoint(returnOp); - for (unsigned i = 0; i < returnOp.getNumOperands(); ++i) { - auto val = returnOp->getOperand(i); - if (auto memrefType = dyn_cast(val.getType())) { - auto allocOp = - builder.create(returnOp->getLoc(), memrefType); - builder.create(allocOp.getLoc(), TypeRange(), - ValueRange(), allocOp->getResult(0), - val); - // FIXME: may be leak memory - // auto gpuDeallocOp = builder.create( - // gpuMemcpyOp->getLoc(), TypeRange(), ValueRange(), val); - returnOp->setOperand(i, allocOp->getResult(0)); - } - } - for (auto value : unDeallocatedValue) { - builder.create(returnOp->getLoc(), TypeRange(), - ValueRange(), value); - } - } - return WalkResult::advance(); - }); -} -} // end anonymous namespace. 
- -namespace mlir { -namespace buddy { -void registerConvertMemcpyToGPUPass() { - PassRegistration(); -} -} // namespace buddy -} // namespace mlir \ No newline at end of file +} \ No newline at end of file From 321927ca88028a77ab92e88a6a8169a3445d89bb Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 02:45:37 +0000 Subject: [PATCH 26/29] Pass the test --- frontend/Python/frontend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index 9d8c80f014..210815fb1e 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -45,6 +45,7 @@ from .graph import Graph, TensorDType, TensorMeta from .graph.operation import * from .graph.transform import maxpool2d_simplify +from .graph.type import * class DynamoCompiler: @@ -284,6 +285,7 @@ def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): fake_params, self._ops_registry, self._func_name, + DeviceType.CPU, self._verbose ) for gm_node in _gm.graph.nodes: From 5d5a844c7b534429d07dab4c70e2b30dce243179 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 03:15:51 +0000 Subject: [PATCH 27/29] correct --- examples/BuddyLeNet/CMakeLists.txt | 23 ----- frontend/Python/graph/json_decoder.py | 94 ++++++++++++------- .../Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp | 2 +- tests/Conversion/convert-memcpy-to-gpu.mlir | 2 +- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt index 5935ad50c5..1902384f92 100644 --- a/examples/BuddyLeNet/CMakeLists.txt +++ b/examples/BuddyLeNet/CMakeLists.txt @@ -52,29 +52,6 @@ add_custom_command( set(ONE_SHOT_BUFFERIZE_OPTION "bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map") set(LOWER_TO_NVVM_OPTION "cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin") -# add_custom_command( -# OUTPUT subgraph0.o -# COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -# -one-shot-bufferize=${ONE_SHOT_BUFFERIZE_OPTION} -# -buffer-deallocation -# -convert-linalg-to-parallel-loops -# -canonicalize -# -gpu-map-parallel-loops -# -convert-parallel-loops-to-gpu -# -gpu-kernel-outlining -# -canonicalize -# -cse | -# ${BUDDY_BINARY_DIR}/buddy-opt -convert-memcpy-to-gpu -gpu-async-region -canonicalize | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-opt -llvm-request-c-wrappers --test-lower-to-nvvm=${LOWER_TO_NVVM_OPTION} | -# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | -# ${LLVM_TOOLS_BINARY_DIR}/llvm-as | -# ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o -# DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir -# COMMENT "Building subgraph0.o" -# VERBATIM) - add_custom_command( OUTPUT subgraph1.o COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph1.mlir diff --git a/frontend/Python/graph/json_decoder.py b/frontend/Python/graph/json_decoder.py index cfa825b0aa..f3a11440ac 100644 --- a/frontend/Python/graph/json_decoder.py +++ b/frontend/Python/graph/json_decoder.py @@ -1,3 +1,22 @@ +# ===- json_decoder.py --------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This converts the JSON string representing Buddy Graph into a Graph object. +# +# ===--------------------------------------------------------------------------- import json from pathlib import Path @@ -11,6 +30,7 @@ from ..ops.math import ops_registry as math_ops_registry from ..ops.func import ops_registry as func_ops_registry + def json_to_graph(json_str): """ Converts a buddy graph JSON string to a Graph object. @@ -21,6 +41,7 @@ def json_to_graph(json_str): Returns: Graph: The Graph object created from the JSON data. """ + def json_to_tensormeta(json_data): """ Convert JSON data to a TensorMeta object. @@ -31,68 +52,77 @@ def json_to_tensormeta(json_data): Returns: TensorMeta: The TensorMeta object created from the JSON data. """ - if 'shape' in json_data: - shape = json_data['shape'] + if "shape" in json_data: + shape = json_data["shape"] dtype = next( - (member for member in TensorDType.__members__.values() - if member.value.upper() == json_data['dtype'].upper()), None + ( + member + for member in TensorDType.__members__.values() + if member.value.upper() == json_data["dtype"].upper() + ), + None, ) return TensorMeta(shape, dtype) return {} - + json_data = json.loads(json_str) _graph = json_data - graph_name = _graph['graph_name'] + graph_name = _graph["graph_name"] inputs = [] params = [] - for _input in _graph['inputs']: + for _input in _graph["inputs"]: inputs.append(json_to_tensormeta(_input)) - for _param in _graph['params']: + for _param in _graph["params"]: params.append(json_to_tensormeta(_param)) ops_registry = {} ops_registry.update(func_ops_registry) ops_registry.update(linalg_ops_registry) ops_registry.update(tosa_ops_registry) ops_registry.update(math_ops_registry) - graph = Graph( - inputs, - params, - ops_registry, - graph_name - ) - graph.device = _graph['device'] - for _node in _graph['nodes']: - op_class = _node['class'] + graph = Graph(inputs, params, ops_registry, graph_name) + graph.device = _graph["device"] + for _node in _graph["nodes"]: + op_class = _node["class"] op = globals()[op_class]() - op._name = _node['name'] - op._children = _node['children'] - op._parents = _node['parents'] - op._arguments = _node['arguments'] - op._keyword_arguments = _node['keyword_arguments'] + op._name = _node["name"] + op._children = _node["children"] + op._parents = _node["parents"] + op._arguments = _node["arguments"] + op._keyword_arguments = _node["keyword_arguments"] op._type = next( - (member for member in OpType.__members__.values() if member.value == _node['type']), None + ( + member + for member in OpType.__members__.values() + if member.value == _node["type"] + ), + None, ) # TODO : node attr tensor_meta should be Class TensorMeta - if ('shape' not in _node['tensor_meta']): - op._tensor_meta = _node['tensor_meta'] + if "shape" not in _node["tensor_meta"]: + op._tensor_meta = _node["tensor_meta"] else: op._tensor_meta = { - 'shape' : _node['tensor_meta']['shape'], - 'dtype' : next( - (member for member in 
TensorDType.__members__.values() - if member.value.upper() == _node['tensor_meta']['dtype'].upper()), None - ) + "shape": _node["tensor_meta"]["shape"], + "dtype": next( + ( + member + for member in TensorDType.__members__.values() + if member.value.upper() + == _node["tensor_meta"]["dtype"].upper() + ), + None, + ), } graph.add_node(op) - for i, device in enumerate(list(set(_graph['node_map_device'].values()))): + for i, device in enumerate(list(set(_graph["node_map_device"].values()))): subgraph_name = "subgraph{}".format(i) graph.op_groups[subgraph_name] = [] graph.group_map_device[subgraph_name] = DeviceType(device) - for node, op_device in _graph['node_map_device'].items(): + for node, op_device in _graph["node_map_device"].items(): op = graph.node_table[node] for subgraph_name, group_device in graph.group_map_device.items(): if op_device == group_device.value: diff --git a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp index f616127930..e44f21cb6e 100644 --- a/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp +++ b/midend/lib/Conversion/MLIRGPU/ConvertMemcpyToGPU.cpp @@ -211,4 +211,4 @@ void registerConvertMemcpyToGPUPass() { PassRegistration(); } } // namespace buddy -} // namespace mlir \ No newline at end of file +} // namespace mlir diff --git a/tests/Conversion/convert-memcpy-to-gpu.mlir b/tests/Conversion/convert-memcpy-to-gpu.mlir index 573000a4b5..65e9301e4a 100644 --- a/tests/Conversion/convert-memcpy-to-gpu.mlir +++ b/tests/Conversion/convert-memcpy-to-gpu.mlir @@ -66,4 +66,4 @@ module attributes {gpu.container_module} { gpu.return } } -} \ No newline at end of file +} From 69c4262f6dd1fa7ccca5d765e71d7edb1892f35c Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 03:43:37 +0000 Subject: [PATCH 28/29] temp --- examples/BuddyLeNet/buddy-lenet-import.py | 2 +- frontend/Python/graph/transform/__init__.py | 2 +- frontend/Python/graph/transform/fuse_ops.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index aec4e5e561..d6bcd30b8a 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -27,7 +27,7 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import cpu_fuse, gpu_fuse, custom_partition +from buddy.compiler.graph.transform import simply_fuse, gpu_fuse, custom_partition from buddy.compiler.graph.type import DeviceType from buddy.compiler.ops import tosa, gpu from buddy.compiler.graph.json_decoder import json_to_graph diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py index 427d266b95..95428b3367 100644 --- a/frontend/Python/graph/transform/__init__.py +++ b/frontend/Python/graph/transform/__init__.py @@ -18,5 +18,5 @@ # # ===--------------------------------------------------------------------------- -from .fuse_ops import cpu_fuse, gpu_fuse, custom_partition +from .fuse_ops import simply_fuse, gpu_fuse, custom_partition from .useless_op_eliminate import maxpool2d_simplify diff --git a/frontend/Python/graph/transform/fuse_ops.py b/frontend/Python/graph/transform/fuse_ops.py index e0ff806f52..7bfd2e8f98 100644 --- a/frontend/Python/graph/transform/fuse_ops.py +++ b/frontend/Python/graph/transform/fuse_ops.py @@ -29,7 +29,7 @@ # ANCHOR_OP_TYPE = [] -def cpu_fuse(graph: Graph): +def 
simply_fuse(graph: Graph): """ Function to fuse all operations into one graph. Set the device type to CPU. From bf7ca39b8b99737cae7b9d47d488011c5363d3b8 Mon Sep 17 00:00:00 2001 From: WuXintong123 <13683168028@163.com> Date: Wed, 30 Oct 2024 05:34:56 +0000 Subject: [PATCH 29/29] final --- examples/BuddyLeNet/buddy-lenet-import.py | 17 ++++++----------- frontend/Python/graph/graph.py | 5 +---- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/examples/BuddyLeNet/buddy-lenet-import.py b/examples/BuddyLeNet/buddy-lenet-import.py index d6bcd30b8a..2ef14649e6 100644 --- a/examples/BuddyLeNet/buddy-lenet-import.py +++ b/examples/BuddyLeNet/buddy-lenet-import.py @@ -27,7 +27,11 @@ from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.graph import GraphDriver -from buddy.compiler.graph.transform import simply_fuse, gpu_fuse, custom_partition +from buddy.compiler.graph.transform import ( + simply_fuse, + gpu_fuse, + custom_partition, +) from buddy.compiler.graph.type import DeviceType from buddy.compiler.ops import tosa, gpu from buddy.compiler.graph.json_decoder import json_to_graph @@ -61,15 +65,7 @@ pattern_list = [custom_partition] graph.fuse_ops(pattern_list) path_prefix = os.path.dirname(os.path.abspath(__file__)) - -# Convert the lenet graph to JSON string -json_str = graph.to_json() -with open(os.path.join(path_prefix, "lenet.json"), "w") as module_file: - module_file.write(json_str) - -# Convert the lenet graph Json string to a lenet graph -graph0 = json_to_graph(json_str) -driver = GraphDriver(graph0) +driver = GraphDriver(graph) driver.subgraphs[0].lower_to_top_level_ir() with open(os.path.join(path_prefix, "subgraph0.mlir"), "w") as module_file: print(driver.subgraphs[0]._imported_module, file=module_file) @@ -88,4 +84,3 @@ ) float32_param.tofile(Path(current_path) / "arg0.data") - diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py index 3283beacee..ddf50f697c 100644 --- a/frontend/Python/graph/graph.py +++ b/frontend/Python/graph/graph.py @@ -198,13 +198,10 @@ def fuse_ops(self, pattern_list: List[FunctionType]): # TODO: discuss two fuse strategy # 1. fuse ops adapt for DSA(hardware dependent) # 2. common fuse strategy(hardware independent) - - # Initialize operation groups - self.init_op_group() - # Apply fusion patterns for pattern_func in pattern_list: pattern_func(self) + # Initialize operation groups def perform(self, func_list: List[FunctionType]): """
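
A minimal usage sketch of the JSON round trip these patches introduce, assuming a graph was already fused and serialized with graph.to_json() into "lenet.json" (the file name and location are illustrative); it only uses calls that appear in the patches above (json_to_graph, GraphDriver, lower_to_top_level_ir) and is a sketch, not part of the patch series itself:

import os

from buddy.compiler.graph import GraphDriver
from buddy.compiler.graph.json_decoder import json_to_graph

# Illustrative assumption: lenet.json was produced by an earlier run of
# buddy-lenet-import.py via graph.to_json().
path_prefix = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(path_prefix, "lenet.json"), "r") as f:
    json_str = f.read()

# Rebuild the Graph object, including its op groups and device mapping.
graph0 = json_to_graph(json_str)

# Drive the decoded graph and lower subgraph 0 to top-level MLIR.
driver = GraphDriver(graph0)
driver.subgraphs[0].lower_to_top_level_ir()
print(driver.subgraphs[0]._imported_module)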