diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02ca0c2..ab2566f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,12 +44,22 @@ set(APFP_PORT_MAPPING MatrixMultiplication_1.m_axi_a:DDR[1]
                       MatrixMultiplication_1.m_axi_b:DDR[1]
                       MatrixMultiplication_1.m_axi_c_read:DDR[1]
                       MatrixMultiplication_1.m_axi_c_write:DDR[1])
+# Stream links between each MatrixMultiplication compute unit and its FreeRunningMultiplication counterpart
+# (handed to add_vitis_program as CONNECTIVITY).
+# NOTE(review): entries are only added for compute units 1-4 -- confirm whether units 5 and above need them.
+set(APFP_CONNECTIVITY MatrixMultiplication_1.a_to_kernel:FreeRunningMultiplication_1.a_to_kernel
+                      MatrixMultiplication_1.b_to_kernel:FreeRunningMultiplication_1.b_to_kernel
+                      FreeRunningMultiplication_1.ab_from_kernel:MatrixMultiplication_1.ab_from_kernel)
 if(${APFP_COMPUTE_UNITS} GREATER 1)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
                           MatrixMultiplication_2.m_axi_a:DDR[0]
                           MatrixMultiplication_2.m_axi_b:DDR[0]
                           MatrixMultiplication_2.m_axi_c_read:DDR[0]
                           MatrixMultiplication_2.m_axi_c_write:DDR[0])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_2.a_to_kernel:FreeRunningMultiplication_2.a_to_kernel
+                          MatrixMultiplication_2.b_to_kernel:FreeRunningMultiplication_2.b_to_kernel
+                          FreeRunningMultiplication_2.ab_from_kernel:MatrixMultiplication_2.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 2)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
@@ -57,6 +64,10 @@ if(${APFP_COMPUTE_UNITS} GREATER 2)
                           MatrixMultiplication_3.m_axi_b:DDR[2]
                           MatrixMultiplication_3.m_axi_c_read:DDR[2]
                           MatrixMultiplication_3.m_axi_c_write:DDR[2])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_3.a_to_kernel:FreeRunningMultiplication_3.a_to_kernel
+                          MatrixMultiplication_3.b_to_kernel:FreeRunningMultiplication_3.b_to_kernel
+                          FreeRunningMultiplication_3.ab_from_kernel:MatrixMultiplication_3.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 3)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
@@ -64,6 +75,10 @@ if(${APFP_COMPUTE_UNITS} GREATER 3)
                           MatrixMultiplication_4.m_axi_b:DDR[3]
                           MatrixMultiplication_4.m_axi_c_read:DDR[3]
                           MatrixMultiplication_4.m_axi_c_write:DDR[3])
+    set(APFP_CONNECTIVITY ${APFP_CONNECTIVITY}
+                          MatrixMultiplication_4.a_to_kernel:FreeRunningMultiplication_4.a_to_kernel
+                          MatrixMultiplication_4.b_to_kernel:FreeRunningMultiplication_4.b_to_kernel
+                          FreeRunningMultiplication_4.ab_from_kernel:MatrixMultiplication_4.ab_from_kernel)
 endif()
 if(${APFP_COMPUTE_UNITS} GREATER 4)
     set(APFP_PORT_MAPPING ${APFP_PORT_MAPPING}
@@ -98,20 +113,32 @@ if(${APFP_COMPUTE_UNITS} GREATER 8)
 endif()
 
 # Setup FPGA kernel targets
+set(APFP_HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
+set(APFP_HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16")
+set(APFP_INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR})
+set(APFP_DEPENDS ${CMAKE_BINARY_DIR}/Config.h
+                 include/ArithmeticOperations.h
+                 include/DeviceTypes.h
+                 include/Karatsuba.h
+                 include/MatrixMultiplication.h
+                 include/PackedFloat.h
+                 include/PipelinedAdd.h)
 add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
                  COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
-                 INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR}
-                 HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS"
-                 HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16"
-                 DEPENDS ${CMAKE_BINARY_DIR}/Config.h
-                         include/ArithmeticOperations.h
-                         include/DeviceTypes.h
-                         include/Karatsuba.h
-                         include/MatrixMultiplication.h
-                         include/PackedFloat.h
-                         include/PipelinedAdd.h
+                 INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
+                 HLS_FLAGS ${APFP_HLS_FLAGS}
+                 HLS_CONFIG ${APFP_HLS_CONFIG}
+                 DEPENDS ${APFP_DEPENDS}
                 PORT_MAPPING ${APFP_PORT_MAPPING})
+add_vitis_kernel(FreeRunningMultiplication FILES ${APFP_KERNEL_FILES}
+                 COMPUTE_UNITS ${APFP_COMPUTE_UNITS}
+                 INCLUDE_DIRS ${APFP_INCLUDE_DIRS}
+                 HLS_FLAGS ${APFP_HLS_FLAGS}
+                 HLS_CONFIG ${APFP_HLS_CONFIG}
+                 DEPENDS ${APFP_DEPENDS})
 add_vitis_program(MatrixMultiplication ${APFP_PLATFORM}
+                  KERNELS MatrixMultiplication FreeRunningMultiplication
+                  CONNECTIVITY ${APFP_CONNECTIVITY}
                   PROFILING ${APFP_PROFILING}
                   DEBUGGING ${APFP_DEBUGGING}
                   SAVE_TEMPS ${APFP_SAVE_TEMPS})
diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp
index 594cf69..5e9c00e 100644
--- a/device/MatrixMultiplication.cpp
+++ b/device/MatrixMultiplication.cpp
@@ -330,42 +330,70 @@ void WriteC(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem, const
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_in, hlslib::Stream<PackedFloat> &c_in,
-             hlslib::Stream<PackedFloat> &c_out, int const size_n, int const size_k, int const size_m) {
-    PackedFloat a_buffer;  // Just to make A symmetric to B and C
+// Operand stage of the computation: pops one value from a_in and one from b_in on every innermost iteration and
+// pushes one operand pair to a_out/b_out. The freshly popped A is used only when m1 == 0 and the freshly popped B
+// only when n1 == 0; otherwise the buffered value is forwarded again. Out-of-bounds positions are pushed as
+// PackedFloat::Zero().
+void ComputeEntry(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_in,
+                  hlslib::Stream<PackedFloat> &a_out, hlslib::Stream<PackedFloat> &b_out, int const size_n,
+                  int const size_k, int const size_m) {
+    PackedFloat a_buffer;
     PackedFloat b_buffer[kTileSizeM];
-    PackedFloat c_buffer[kTileSizeN * kTileSizeM];
     const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
     const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
-Compute_TilesN:
+ComputeEntry_TilesN:
     for (int n0 = 0; n0 < tiles_n; ++n0) {
-    Compute_TilesM:
+    ComputeEntry_TilesM:
         for (int m0 = 0; m0 < tiles_m; ++m0) {
-        Compute_K:
+        ComputeEntry_K:
             for (int k = 0; k < size_k; ++k) {
-            Compute_N:
+            ComputeEntry_N:
                 for (int n1 = 0; n1 < ((n0 < tiles_n - 1) ? kTileSizeN : (size_n - n0 * kTileSizeN)); ++n1) {
-                Compute_M:
+                ComputeEntry_M:
                     for (int m1 = 0; m1 < kTileSizeM; ++m1) {
 #pragma HLS PIPELINE II = 1
 #pragma HLS LOOP_FLATTEN
                         const PackedFloat a_read = a_in.Pop();
                         const PackedFloat b_read = b_in.Pop();
-                        const PackedFloat c_read = c_in.Pop();
                         const PackedFloat a = (m1 == 0) ? a_read : a_buffer;
                         const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1];
-                        const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1];
                         a_buffer = a;
                         b_buffer[m1] = b;
                         // Ignore contributions from out-of-bound indices
                         const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m);
-                        // Meat of the computation
-                        const auto res = MultiplyAccumulate(in_bounds ? a : PackedFloat::Zero(),
-                                                            in_bounds ? b : PackedFloat::Zero(), c);
-                        // Write back to buffer
+                        a_out.Push(in_bounds ? a : PackedFloat::Zero());
+                        b_out.Push(in_bounds ? b : PackedFloat::Zero());
+                    }
+                }
+            }
+        }
+    }
+}
+
+void ComputeExit(hlslib::Stream<PackedFloat> &ab_in, hlslib::Stream<PackedFloat> &c_in,
+                 hlslib::Stream<PackedFloat> &c_out, int const size_n, int const size_k, int const size_m) {
+    PackedFloat c_buffer[kTileSizeN * kTileSizeM];
+    const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN);
+    const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM);
+ComputeExit_TilesN:
+    for (int n0 = 0; n0 < tiles_n; ++n0) {
+    ComputeExit_TilesM:
+        for (int m0 = 0; m0 < tiles_m; ++m0) {
+        ComputeExit_K:
+            for (int k = 0; k < size_k; ++k) {
+            ComputeExit_N:
+                for (int n1 = 0; n1 < ((n0 < tiles_n - 1) ? kTileSizeN : (size_n - n0 * kTileSizeN)); ++n1) {
+                ComputeExit_M:
+                    for (int m1 = 0; m1 < kTileSizeM; ++m1) {
+#pragma HLS PIPELINE II = 1
+#pragma HLS LOOP_FLATTEN
+                        const PackedFloat ab = ab_in.Pop();
+                        const PackedFloat c_read = c_in.Pop();
+                        const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1];
+                        const PackedFloat res = Add(ab, c);
+                        c_out.Push(res);
                         c_buffer[n1 * kTileSizeM + m1] = res;
 #pragma HLS DEPENDENCE variable = c_buffer false
-                        c_out.Push(res);
                     }
                 }
             }
@@ -375,8 +399,24 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i
 
 ////////////////////////////////////////////////////////////////////////////////
 
+// Free-running multiplier: pops one operand from each input stream and pushes their product to the output stream.
+// All three ports are AXI streams (axis) and the kernel uses ap_ctrl_none, i.e. no block-level control protocol.
+void FreeRunningMultiplication(hlslib::Stream<PackedFloat> &a_to_kernel, hlslib::Stream<PackedFloat> &b_to_kernel,
+                               hlslib::Stream<PackedFloat> &ab_from_kernel) {
+#pragma HLS INTERFACE axis port = a_to_kernel
+#pragma HLS INTERFACE axis port = b_to_kernel
+#pragma HLS INTERFACE axis port = ab_from_kernel
+#pragma HLS INTERFACE ap_ctrl_none port = return
+#pragma HLS PIPELINE II = 1
+    ab_from_kernel.Push(Multiply(a_to_kernel.Pop(), b_to_kernel.Pop()));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
 void MatrixMultiplication(DramLine const *const a, DramLine const *const b, DramLine const *const c_read,
-                          DramLine *const c_write, const int size_n, const int size_k, int const size_m) {
+                          DramLine *const c_write, const int size_n, const int size_k, int const size_m,
+                          hlslib::Stream<PackedFloat> &a_to_kernel, hlslib::Stream<PackedFloat> &b_to_kernel,
+                          hlslib::Stream<PackedFloat> &ab_from_kernel) {
 #pragma HLS INTERFACE m_axi offset = slave port = a bundle = a
 #pragma HLS INTERFACE m_axi offset = slave port = b bundle = b
 // Even though they actually point to the same memory location, we use two separate interfaces for reading and writing
@@ -390,6 +428,9 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram
 #pragma HLS INTERFACE s_axilite port = size_n
 #pragma HLS INTERFACE s_axilite port = size_k
 #pragma HLS INTERFACE s_axilite port = size_m
+#pragma HLS INTERFACE axis port = a_to_kernel
+#pragma HLS INTERFACE axis port = b_to_kernel
+#pragma HLS INTERFACE axis port = ab_from_kernel
 #pragma HLS STABLE variable = a
 #pragma HLS STABLE variable = b
 #pragma HLS STABLE variable = c_read
@@ -399,21 +440,23 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram
 #pragma HLS STABLE variable = size_m
 #pragma HLS DATAFLOW
+    // Internal FIFOs only; a_to_kernel, b_to_kernel and ab_from_kernel are the AXI stream arguments of this kernel
     hlslib::Stream<PackedFloat, 16> a_to_feeder("a_to_feeder");
-    hlslib::Stream<PackedFloat, 16> a_to_kernel("a_to_kernel");
+    hlslib::Stream<PackedFloat, 16> a_to_entry("a_to_entry");
     hlslib::Stream<PackedFloat, 16> b_to_feeder("b_to_feeder");
-    hlslib::Stream<PackedFloat, 16> b_to_kernel("b_to_kernel");
+    hlslib::Stream<PackedFloat, 16> b_to_entry("b_to_entry");
     hlslib::Stream<PackedFloat, 16> c_to_feeder("c_to_feeder");
     hlslib::Stream<PackedFloat, 16> c_to_kernel("c_to_kernel");
     hlslib::Stream<PackedFloat, 16> c_from_kernel("c_from_kernel");
     hlslib::Stream<PackedFloat, 16> c_from_drainer("c_from_drainer");
     HLSLIB_DATAFLOW_INIT();
     HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_feeder, size_n, size_k, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_entry, size_n, size_k, size_m);
     HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_feeder, size_n, size_k, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_entry, size_n, size_k, size_m);
     HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_feeder, size_n, size_m);
     HLSLIB_DATAFLOW_FUNCTION(FeedC, c_to_feeder, c_to_kernel, size_n, size_k, size_m);
-    HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(ComputeEntry, a_to_entry, b_to_entry, a_to_kernel, b_to_kernel, size_n, size_k, size_m);
+    HLSLIB_DATAFLOW_FUNCTION(ComputeExit, ab_from_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m);
     HLSLIB_DATAFLOW_FUNCTION(DrainC, c_from_kernel, c_from_drainer, size_n, size_k, size_m);
     HLSLIB_DATAFLOW_FUNCTION(WriteC, c_from_drainer, c_write, size_n, size_m);
     HLSLIB_DATAFLOW_FINALIZE();
diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp
index 4655a58..271b329 100644
--- a/host/TestProgram.cpp
+++ b/host/TestProgram.cpp
@@ -1,4 +1,5 @@
 #include <hlslib/xilinx/OpenCL.h>
+#include <hlslib/xilinx/Stream.h>
 #include <hlslib/xilinx/Utility.h>
 
 #include <cstdlib>  // putenv
@@ -22,6 +23,15 @@ struct MpfrWrapper {
 };
 
 #ifdef HLSLIB_SIMULATE_OPENCL
+// Simulation-only driver for the free-running kernel: calls FreeRunningMultiplication on the given streams in an
+// endless loop. Never returns, so it is meant to be run on a dedicated thread.
+void RunFreeRunningKernel(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_in,
+                          hlslib::Stream<PackedFloat> &ab_out) {
+    for (;;) {
+        FreeRunningMultiplication(a_in, b_in, ab_out);
+    }
+}
+
 bool RunTestSimulation(int size_n, int size_k, int size_m, bool verify) {
     const std::string kernel_path("");
 #else
@@ -111,9 +119,14 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m,
     // In simulation mode, this will call the function "MatrixMultiplication" and run it in software.
     // Otherwise, the provided path to a kernel binary will be loaded and executed.
     std::vector<hlslib::ocl::Kernel> kernels;
+    hlslib::Stream<PackedFloat> a_to_kernel[kComputeUnits];
+    hlslib::Stream<PackedFloat> b_to_kernel[kComputeUnits];
+    hlslib::Stream<PackedFloat> ab_from_kernel[kComputeUnits];
     for (int i = 0; i < kComputeUnits; ++i) {
-        kernels.emplace_back(program.MakeKernel(MatrixMultiplication, "MatrixMultiplication", a_device[i], b_device[i],
-                                                c_device[i], c_device[i], n_partition_size[i], size_k, size_m));
+        kernels.emplace_back(program.MakeKernel(
+            MatrixMultiplication, "MatrixMultiplication", a_device[i], b_device[i], c_device[i], c_device[i],
+            n_partition_size[i], size_k, size_m, hlslib::ocl::SimulationOnly(a_to_kernel[i]),
+            hlslib::ocl::SimulationOnly(b_to_kernel[i]), hlslib::ocl::SimulationOnly(ab_from_kernel[i])));
     }
 
     const float expected_runtime = expected_cycles / 0.3e9;
@@ -126,6 +139,14 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m,
               << bandwidth << " GB/s.\n";
 
     std::cout << "Executing kernel...\n";
+#ifdef HLSLIB_SIMULATE_OPENCL
+    for (int i = 0; i < kComputeUnits; ++i) {
+        std::thread free_running(RunFreeRunningKernel, std::ref(a_to_kernel[i]), std::ref(b_to_kernel[i]),
+                                 std::ref(ab_from_kernel[i]));
+        // Never joined; dies at process exit. NOTE(review): may outlive the function-local streams it references
+        free_running.detach();
+    }
+#endif
     std::vector<hlslib::ocl::Event> events;
     auto start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < kComputeUnits; ++i) {
diff --git a/include/MatrixMultiplication.h b/include/MatrixMultiplication.h
index 43198fe..d2ced76 100644
--- a/include/MatrixMultiplication.h
+++ b/include/MatrixMultiplication.h
@@ -1,7 +1,21 @@
 #pragma once
 
+#include <hlslib/xilinx/Stream.h>
+
 #include "Config.h"
 #include "DeviceTypes.h"
+#include "PackedFloat.h"
 
+// Matrix multiplication kernel entry point. The three integer arguments are, in order, the n, k and m sizes
+// (size_n, size_k, size_m in the definition). a_to_kernel/b_to_kernel carry the operands out of this kernel and
+// ab_from_kernel carries the products back in.
 extern "C" void MatrixMultiplication(DramLine const *a, DramLine const *b, DramLine const *c_read, DramLine *c_write,
-                                     int n, int m, int k);
+                                     int n, int k, int m, hlslib::Stream<PackedFloat> &a_to_kernel,
+                                     hlslib::Stream<PackedFloat> &b_to_kernel,
+                                     hlslib::Stream<PackedFloat> &ab_from_kernel);
+
+// Free-running multiplier kernel: pops one value from each of a_to_kernel and b_to_kernel and pushes their product
+// to ab_from_kernel.
+extern "C" void FreeRunningMultiplication(hlslib::Stream<PackedFloat> &a_to_kernel,
+                                          hlslib::Stream<PackedFloat> &b_to_kernel,
+                                          hlslib::Stream<PackedFloat> &ab_from_kernel);