diff --git a/examples/batchMatMulOpt/.gitignore b/examples/batchMatMulOpt/.gitignore
new file mode 100644
index 0000000000..80a243fa81
--- /dev/null
+++ b/examples/batchMatMulOpt/.gitignore
@@ -0,0 +1 @@
+log.*
diff --git a/examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir b/examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir
new file mode 100644
index 0000000000..42640a3f32
--- /dev/null
+++ b/examples/batchMatMulOpt/linalg-batch-matmul-f32-1.mlir
@@ -0,0 +1,49 @@
+
+func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }
+
+func.func @main() {
+
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c576 = arith.constant 576 : index
+  %c1024 = arith.constant 1024 : index
+  %f2 = arith.constant 2.0 : f32
+  %f3 = arith.constant 3.0 : f32
+  %f0 = arith.constant 0.0 : f32
+
+  %a = memref.alloc() : memref<1x1x576xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c576 step %c1 {
+        memref.store %f3, %a[%arg0, %arg1, %arg2] : memref<1x1x576xf32>
+      }
+    }
+  }
+
+  %b = memref.alloc() : memref<1x576x1024xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c576 step %c1 {
+      scf.for %arg2 = %c0 to %c1024 step %c1 {
+        memref.store %f2, %b[%arg0, %arg1, %arg2] : memref<1x576x1024xf32>
+      }
+    }
+  }
+
+  %c = memref.alloc() : memref<1x1x1024xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c1024 step %c1 {
+        memref.store %f0, %c[%arg0, %arg1, %arg2] : memref<1x1x1024xf32>
+      }
+    }
+  }
+
+  linalg.batch_matmul
+    ins(%a, %b : memref<1x1x576xf32>, memref<1x576x1024xf32>)
+    outs(%c : memref<1x1x1024xf32>)
+
+  %printed_c = memref.cast %c : memref<1x1x1024xf32> to memref<*xf32>
+  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
+
+  return
+}
diff --git a/examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir b/examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir
new file mode 100644
index 0000000000..9c58ca7ff2
--- /dev/null
+++ b/examples/batchMatMulOpt/linalg-batch-matmul-f32-2.mlir
@@ -0,0 +1,72 @@
+// RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm \
+// RUN: -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-loops -convert-scf-to-cf -llvm-request-c-wrappers \
+// RUN: -convert-func-to-llvm -reconcile-unrealized-casts %s \
+// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+#map = affine_map<(d0) -> (d0 mod 64)>
+#map1 = affine_map<(d0) -> (d0 floordiv 64)>
+#map2 = affine_map<(d0) -> (d0)>
+#map3 = affine_map<(d0) -> (d0 * 64)>
+#set = affine_set<(d0)[s0] : (d0 * -64 + s0 - 64 >= 0)>
+
+func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }
+
+func.func @batch_matmul(%arg0: memref<1x1x1024xf32>, %arg1: memref<1x1024x1000xf32>, %arg2: memref<1x1x1000xf32>) {
+  linalg.batch_matmul
+    ins(%arg0, %arg1 : memref<1x1x1024xf32>, memref<1x1024x1000xf32>)
+    outs(%arg2 : memref<1x1x1000xf32>)
+  return
+}
+
+func.func @main() {
+
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c1024 = arith.constant 1024 : index
+  %c1000 = arith.constant 1000 : index
+  %f0 = arith.constant 0.0 : f32
+  %f2 = arith.constant 2.0 : f32
+  %f3 = arith.constant 3.0 : f32
+
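+  // %a holds 3.0 and %b holds 2.0 everywhere, so each of the 1000 output
+  // elements accumulates 1024 * 3.0 * 2.0 = 6144.0; the CHECK lines at the
+  // bottom of @main verify that value.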
+  %a = memref.alloc() : memref<1x1x1024xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c1024 step %c1 {
+        memref.store %f3, %a[%arg0, %arg1, %arg2] : memref<1x1x1024xf32>
+      }
+    }
+  }
+
+  %b = memref.alloc() : memref<1x1024x1000xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1024 step %c1 {
+      scf.for %arg2 = %c0 to %c1000 step %c1 {
+        memref.store %f2, %b[%arg0, %arg1, %arg2] : memref<1x1024x1000xf32>
+      }
+    }
+  }
+
+  %c = memref.alloc() : memref<1x1x1000xf32>
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    scf.for %arg1 = %c0 to %c1 step %c1 {
+      scf.for %arg2 = %c0 to %c1000 step %c1 {
+        memref.store %f0, %c[%arg0, %arg1, %arg2] : memref<1x1x1000xf32>
+      }
+    }
+  }
+
+  linalg.batch_matmul
+    ins(%a, %b : memref<1x1x1024xf32>, memref<1x1024x1000xf32>)
+    outs(%c : memref<1x1x1000xf32>)
+
+  %printed_c = memref.cast %c : memref<1x1x1000xf32> to memref<*xf32>
+  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 1, 1000] strides = [1000, 1000, 1] data =
+  // CHECK-NEXT: [[[6144,
+  return
+}
diff --git a/examples/batchMatMulOpt/makefile b/examples/batchMatMulOpt/makefile
new file mode 100644
index 0000000000..68bb28a5b3
--- /dev/null
+++ b/examples/batchMatMulOpt/makefile
@@ -0,0 +1,175 @@
+# Build/run targets for the batch matmul optimization examples.
+BUDDY_OPT := ../../build/bin/buddy-opt
+MLIR_OPT := ../../llvm/build/bin/mlir-opt
+MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
+MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
+LLC := ../../llvm/build/bin/llc
+OPT_FLAG := -O0
+
+ifeq ($(shell uname),Linux)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+MTRIPLE := x86_64-unknown-linux-gnu
+else ifeq ($(shell uname),Darwin)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
+MTRIPLE := x86_64-apple-darwin
+endif
+
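+# Each example has three flavors of target: `*-lower` writes the transformed
+# MLIR to ./log.mlir, `*-translate` emits LLVM IR to log.ll, and `*-run`
+# executes the example with mlir-cpu-runner and prints the result memref.
+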
+batch-matmul-1-lower:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts \
+	-o ./log.mlir
+
+
+batch-matmul-1-translate:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+batch-matmul-1-run:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-1.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+
+batch-matmul-2-lower:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-o ./log.mlir
+
+
+batch-matmul-2-translate:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+batch-matmul-2-run:
+	@${MLIR_OPT} ./linalg-batch-matmul-f32-2.mlir \
+	-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" | \
+	${BUDDY_OPT} \
+	-convert-elementwise-to-linalg \
+	-func-bufferize-dynamic-offset \
+	-arith-bufferize \
+	-func-bufferize \
+	-tensor-bufferize \
+	-linalg-bufferize \
+	-finalizing-bufferize \
+	-batchmatmul-optimize \
+	-convert-linalg-to-affine-loops \
+	-lower-affine \
+	-convert-vector-to-scf \
+	-convert-scf-to-cf \
+	-llvm-request-c-wrappers \
+	-convert-vector-to-llvm \
+	-convert-math-to-llvm \
+	-convert-math-to-libm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-expand-strided-metadata \
+	-finalize-memref-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
\ No newline at end of file
diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
index 757ac8ae91..864c30c850 100644
--- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
@@ -105,7 +105,9 @@ class BatchMatMulOptimizePattern : public ConversionPattern {
 
     // Apply the column of matrix B.
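+    // d0.floorDiv(affineVectorSize) yields the number of complete
+    // vector-width column blocks; ceilDiv also counted the final partial one.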
     Value appliedColOfB = rewriter.create<affine::AffineApplyOp>(
-        loc, AffineMap::get(1, 0, d0.ceilDiv(affineVectorSize)),
+        loc, AffineMap::get(1, 0, d0.floorDiv(affineVectorSize)),
         ValueRange{bCol});
 
     // Create the primary parallel batch level loop.