diff --git a/CMakeLists.txt b/CMakeLists.txt index c54aee8..f742b42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,10 @@ set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in generated kernels.") set(APFP_SAVE_TEMPS OFF CACHE BOOL "Save temporary files from kernel builds.") set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR) +# One day we might accept both +set(APFP_INTERFACE_TYPE ${APFP_SEMANTICS}) +# but not today + # Validation and derived numbers math(EXPR APFP_ALIGNED "${APFP_BITS} % 512") if(NOT APFP_ALIGNED EQUAL 0) @@ -30,7 +34,7 @@ find_package(GMP REQUIRED) find_package(Threads REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wno-unused-label -Wno-unknown-pragmas -Wno-class-memaccess -DAPFP_${APFP_SEMANTICS}_SEMANTICS") -include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} ) +include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} interface) configure_file(include/Config.h.in Config.h) @@ -40,7 +44,7 @@ set(APFP_KERNEL_FILES device/MatrixMultiplication.cpp # Setup FPGA kernel targets add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES} - INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} + INCLUDE_DIRS include hlslib/include ${CMAKE_BINARY_DIR} ${GMP_INCLUDES} HLS_FLAGS "-DAP_INT_MAX_W=${APFP_MAX_BITS} -DAPFP_${APFP_SEMANTICS}_SEMANTICS" HLS_CONFIG "config_compile -pipeline_style frp\nconfig_dataflow -fifo_depth 16" DEPENDS ${CMAKE_BINARY_DIR}/Config.h @@ -66,9 +70,9 @@ add_library(simulation ${APFP_KERNEL_FILES}) target_compile_options(simulation PRIVATE -Wno-unknown-pragmas -DAP_INT_MAX_W=${APFP_MAX_BITS}) target_link_libraries(simulation ${CMAKE_THREAD_LIBS_INIT}) -add_library(ApfpHostlib SHARED interface/Apfp.cpp) -target_link_libraries(ApfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) -target_compile_definitions(ApfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) +add_library(apfpHostlib SHARED interface/Apfp.cpp interface/ApfpBlas.cpp interface/ApfpInterfaceType.cpp) +target_link_libraries(apfpHostlib ${Vitis_LIBRARIES} ${GMP_LIBRARIES}) +target_compile_definitions(apfpHostlib PRIVATE HLSLIB_SIMULATE_OPENCL) # Executable used to run in simulation mode, calling the kernel as a C++ function directly add_executable(TestSimulation host/TestProgram.cpp) @@ -84,7 +88,20 @@ enable_testing() add_test(TestSimulation TestSimulation 4 4 4) add_library(Catch host/Catch.cpp) add_executable(UnitTests host/UnitTests.cpp) -target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp simulation) +target_link_libraries(UnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp apfpHostlib simulation) add_test(UnitTests UnitTests) -install(TARGETS ApfpHostlib) +add_executable(BlasUnitTests host/BlasUnitTests.cpp) +target_link_libraries(BlasUnitTests Catch ${GMP_LIBRARIES} ${MPFR_LIBRARIES} apfp apfpHostlib simulation) + +install(TARGETS apfpHostlib) +install(FILES + interface/Apfp.h + interface/ApfpBlas.h + interface/ApfpInterfaceType.h + ${CMAKE_BINARY_DIR}/Config.h + DESTINATION include/apfp) +install(FILES + ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw.xclbin + ${CMAKE_BINARY_DIR}/MatrixMultiplication_hw_emu.xclbin + DESTINATION lib) diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp index ec53647..a7aa725 100644 --- a/device/MatrixMultiplication.cpp +++ b/device/MatrixMultiplication.cpp @@ -6,10 +6,15 @@ #include "ArithmeticOperations.h" +// All memory accesses are column-major! +// I.e. 
a(i,j) = a[i + LDA * j] +// AB = sum_k a(i,k) b(k, j) = sum_k a[i + LDA * k] * b[k + LDA * j] +// LDA (leading dimension of A) = stride + // Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not, // because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case. template -void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, +void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int n0, const int k) { #pragma HLS INLINE DramLine num[kLinesPerNumber]; @@ -19,7 +24,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i]; + num[i] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { a_to_feeder.Push(PackedFloat(num)); } @@ -28,7 +33,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_fee } template <> -void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, +void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int n0, const int k) { #pragma HLS INLINE ReadA_N: @@ -36,7 +41,7 @@ void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_ #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN DramLine num[1]; - num[0] = mem[(n0 * kTileSizeN + n1) * size_k + k]; + num[0] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber]; a_to_feeder.Push(PackedFloat(num)); } } @@ -51,7 +56,7 @@ void ReadA(DramLine const *const mem, hlslib::Stream &a_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadA_K: for (int k = 0; k < size_k; ++k) { - ReadAInner(mem, a_to_feeder, size_k, n0, k); + ReadAInner(mem, a_to_feeder, size_n, n0, k); } } } @@ -90,7 +95,7 @@ void FeedA(hlslib::Stream &a_to_feeder, hlslib::Stream //////////////////////////////////////////////////////////////////////////////// template -void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, +void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_k, const int m0, const int k) { #pragma HLS INLINE DramLine num[kLinesPerNumber]; @@ -100,7 +105,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; + num[i] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { b_to_feeder.Push(PackedFloat(num)); } @@ -109,7 +114,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_fee } template <> -void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, +void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_k, const int m0, const int k) { #pragma HLS INLINE ReadB_M: @@ -117,7 +122,7 @@ void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_ #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN DramLine num[1]; - num[0] = mem[k * size_m + m0 * kTileSizeM + m1]; + num[0] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber]; b_to_feeder.Push(PackedFloat(num)); } } @@ -132,7 +137,7 @@ void 
ReadB(DramLine const *const mem, hlslib::Stream &b_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadB_K: for (int k = 0; k < size_k; ++k) { - ReadBInner(mem, b_to_feeder, size_m, m0, k); + ReadBInner(mem, b_to_feeder, size_k, m0, k); } } } @@ -169,7 +174,7 @@ void FeedB(hlslib::Stream &b_to_feeder, hlslib::Stream //////////////////////////////////////////////////////////////////////////////// template -void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, +void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int n0, const int m0, const int n1) { #pragma HLS INLINE ReadC_M: @@ -179,7 +184,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_fee for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; + num[i] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { c_to_feeder.Push(PackedFloat(num)); } @@ -188,7 +193,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_fee } template <> -void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, +void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int n0, const int m0, const int n1) { #pragma HLS INLINE ReadC_M: @@ -196,7 +201,7 @@ void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_ #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN DramLine num[1]; - num[0] = mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1]; + num[0] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber]; c_to_feeder.Push(PackedFloat(num)); } } @@ -210,7 +215,7 @@ void ReadC(DramLine const *const mem, hlslib::Stream &c_to_feeder, for (int m0 = 0; m0 < tiles_m; ++m0) { ReadC_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { - ReadCInner(mem, c_to_feeder, size_m, n0, m0, n1); + ReadCInner(mem, c_to_feeder, size_n, n0, m0, n1); } } } @@ -290,7 +295,7 @@ void WriteCInner(hlslib::Stream &from_kernel, DramLine *const mem, } const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); if (in_bounds) { - mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i] = num[i]; + mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i] = num[i]; } } } @@ -308,7 +313,7 @@ void WriteCInner<1>(hlslib::Stream &from_kernel, DramLine *const me from_kernel.Pop().UnpackFlits(num); const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); if (in_bounds) { - mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1] = num[0]; + mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber] = num[0]; } } } @@ -354,7 +359,7 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i const PackedFloat c_read = c_in.Pop(); const PackedFloat a = (m1 == 0) ? a_read : a_buffer; const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1]; - const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; + const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 + m1 * kTileSizeN]; a_buffer = a; b_buffer[m1] = b; // Ignore contributions from out-of-bound indices @@ -363,7 +368,7 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i const auto res = MultiplyAccumulate(in_bounds ? 
a : PackedFloat::Zero(), in_bounds ? b : PackedFloat::Zero(), c); // Write back to buffer - c_buffer[n1 * kTileSizeM + m1] = res; + c_buffer[n1 + m1 * kTileSizeN] = res; c_out.Push(res); } } diff --git a/host/BlasUnitTests.cpp b/host/BlasUnitTests.cpp new file mode 100644 index 0000000..420925a --- /dev/null +++ b/host/BlasUnitTests.cpp @@ -0,0 +1,250 @@ +#include +#include +#include + +#include "Config.h" + +// #include "ArithmeticOperations.h" +// #include "Karatsuba.h" +// #include "PackedFloat.h" +#include "ApfpBlas.h" +#include "Random.h" + +void ApfpSetup() { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set_default_prec(kMantissaBits); +#else + mpfr_set_default_prec(kMantissaBits); +#endif + auto apfp_error_code = apfp::Init(kMantissaBits); + REQUIRE(!apfp_error_code); +} + +void ApfpTeardown() { + apfp::Finalize(); +} + +bool IsZero(apfp::interface::ConstPtr a) { +#ifdef APFP_GMP_INTERFACE_TYPE + return mpf_sgn(a) == 0; +#else + return mpfr_sgn(a) == 0; +#endif +} + +bool IsClose(apfp::interface::ConstPtr a, apfp::interface::ConstPtr b) { + // Avoids divide by zero if a = b = 0 + if (IsZero(a) && IsZero(b)) { + return true; + } + + apfp::interface::Wrapper diff, sum, ratio; +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_sub(diff.get(), a, b); + mpf_add(sum.get(), a, b); + mpf_div(ratio.get(), diff.get(), sum.get()); + long exp; + mpf_get_d_2exp(&exp, ratio.get()); +#else + mpfr_sub(diff.get(), a, b, kRoundingMode); + mpfr_add(sum.get(), a, b, kRoundingMode); + mpfr_div(ratio.get(), diff.get(), sum.get(), kRoundingMode); + auto exp = mpfr_get_exp(ratio.get()); +#endif + // Require the numbers to match to the first 90% decimal places + // Also demand double precision downcast exactly equal + return (exp < -((kMantissaBits * 3 * 9) / 10)) && (mpfr_get_d(a, kRoundingMode) == mpfr_get_d(b, kRoundingMode)); +} + +TEST_CASE("Init_Teardown") { + ApfpSetup(); + ApfpTeardown(); +} + +TEST_CASE("SYRK") { + ApfpSetup(); + + auto rng = RandomNumberGenerator(); + + unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + + auto mode = GENERATE(apfp::BlasTrans::transpose, apfp::BlasTrans::normal); + auto uplo_mode = GENERATE(apfp::BlasUplo::upper, apfp::BlasUplo::lower); + // Test SYRK + // In 'N' mode, we perform AA^T + C + // A is NxK (A : R^K -> R^N) + // C is NxN + // Matrices are stored column major because BLAS + { + CAPTURE(N, K, mode, uplo_mode); + std::vector a_matrix; + a_matrix.resize(N * K); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector c_matrix; + c_matrix.resize(N * N); + for (auto& v : c_matrix) { + rng.Generate(v.get()); + } + + std::vector ref_result; + ref_result.resize(N * N); + + // Capture inputs for when we explode + std::vector a_matrix_d, c_matrix_d; + a_matrix_d.resize(a_matrix.size()); + std::transform(a_matrix.begin(), a_matrix.end(), a_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + c_matrix_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + CAPTURE(a_matrix_d, c_matrix_d); + + // Compute reference result + apfp::interface::Wrapper prod_temp; + for (unsigned long j = 0; j < N; ++j) { + // lower half + for (unsigned long i = 0; i < N; ++i) { + auto r_idx = mode == apfp::BlasTrans::normal ? 
i + j * N : j + i * N; + apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + + for (unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * N).get(), a_matrix.at(j + k * N).get()); + apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + } + } + } + + // Use APFP BLAS library + auto error_code = apfp::Syrk( + uplo_mode, mode, N, K, [&](unsigned long i) { return a_matrix.at(i).get(); }, + mode == apfp::BlasTrans::normal ? N : K, [&](unsigned long i) { return c_matrix.at(i).get(); }, N); + REQUIRE(!error_code); + + std::vector c_matrix_result_d, c_matrix_ref_result_d; + c_matrix_result_d.resize(c_matrix.size()); + c_matrix_ref_result_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + std::transform(ref_result.begin(), ref_result.end(), c_matrix_ref_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + + CAPTURE(c_matrix_result_d, c_matrix_ref_result_d); + + // Check all entries are sufficiently close + apfp::interface::Wrapper diff; + for (unsigned long j = 0; j < N; ++j) { + // upper half + for (unsigned long i = 0; i < j; ++i) { + auto ref_value = uplo_mode == apfp::BlasUplo::upper ? ref_result.at(i + j * N).get() + : ref_result.at(j + i * N).get(); + auto test_value = + uplo_mode == apfp::BlasUplo::upper ? c_matrix.at(i + j * N).get() : c_matrix.at(j + i * N).get(); + CAPTURE(i, j); + CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); + CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode)); + REQUIRE(IsClose(ref_value, test_value)); + } + } + } + + ApfpTeardown(); +} + +TEST_CASE("GEMM") { + ApfpSetup(); + + auto rng = RandomNumberGenerator(); + + unsigned long M = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long N = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + unsigned long K = GENERATE(0, 1, 2, 8, 15, 16, 31, 32, 33); + + { + CAPTURE(M, N, K); + std::vector a_matrix; + a_matrix.resize(M * K); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector b_matrix; + b_matrix.resize(K * N); + for (auto& v : b_matrix) { + rng.Generate(v.get()); + } + + std::vector c_matrix; + c_matrix.resize(M * N); + for (auto& v : a_matrix) { + rng.Generate(v.get()); + } + + std::vector ref_result; + ref_result.resize(M * N); + + // Capture inputs for when we explode + std::vector a_matrix_d, c_matrix_d; + a_matrix_d.resize(a_matrix.size()); + std::transform(a_matrix.begin(), a_matrix.end(), a_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + c_matrix_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + CAPTURE(a_matrix_d, c_matrix_d); + + // Compute reference result + apfp::interface::Wrapper prod_temp; + for (unsigned long j = 0; j < N; ++j) { + // lower half + for (unsigned long i = 0; i < M; ++i) { + auto r_idx = i + j * M; + apfp::interface::Set(ref_result.at(r_idx).get(), c_matrix.at(r_idx).get()); + + for (unsigned long k = 0; k < K; ++k) { + // (AB)_ij = sum_k A(i,k)B(k,j) + apfp::interface::Mul(prod_temp.get(), a_matrix.at(i + k * M).get(), b_matrix.at(k + j * K).get()); + apfp::interface::Add(ref_result.at(r_idx).get(), prod_temp.get(), ref_result.at(r_idx).get()); + } + } + } + + // Use 
APFP BLAS library + auto error_code = apfp::Gemm( + apfp::BlasTrans::normal, apfp::BlasTrans::normal, M, N, K, + [&](unsigned long i) { return a_matrix.at(i).get(); }, M, + [&](unsigned long i) { return b_matrix.at(i).get(); }, K, + [&](unsigned long i) { return c_matrix.at(i).get(); }, M); + REQUIRE(!error_code); + + std::vector c_matrix_result_d, c_matrix_ref_result_d; + c_matrix_result_d.resize(c_matrix.size()); + c_matrix_ref_result_d.resize(c_matrix.size()); + std::transform(c_matrix.begin(), c_matrix.end(), c_matrix_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + std::transform(ref_result.begin(), ref_result.end(), c_matrix_ref_result_d.begin(), + [](const auto& v) { return mpfr_get_d(v.get(), kRoundingMode); }); + + CAPTURE(c_matrix_result_d, c_matrix_ref_result_d); + + // Check all entries are sufficiently close + apfp::interface::Wrapper diff; + for (unsigned long j = 0; j < N; ++j) { + // upper half + for (unsigned long i = 0; i < M; ++i) { + auto ref_value = ref_result.at(i + j * M).get(); + auto test_value = c_matrix.at(i + j * M).get(); + CAPTURE(i, j); + CAPTURE(PackedFloat(ref_value), PackedFloat(test_value)); + CAPTURE(mpfr_get_d(ref_value, kRoundingMode), mpfr_get_d(test_value, kRoundingMode)); + REQUIRE(IsClose(ref_value, test_value)); + } + } + } + + ApfpTeardown(); +} \ No newline at end of file diff --git a/host/MatrixMultiplicationReference.cpp b/host/MatrixMultiplicationReference.cpp index 42d2b98..3c0d180 100644 --- a/host/MatrixMultiplicationReference.cpp +++ b/host/MatrixMultiplicationReference.cpp @@ -8,8 +8,9 @@ void MatrixMultiplicationReference(mpfr_t const *a, mpfr_t const *b, mpfr_t *c, for (int n = 0; n < size_n; ++n) { for (int k = 0; k < size_k; ++k) { for (int m = 0; m < size_m; ++m) { - mpfr_mul(tmp, a[n * size_k + k], b[k * size_m + m], kRoundingMode); - mpfr_t &_c = c[n * size_m + m]; + // C(n, m) = sum_k A(n, k) B(k, m) + mpfr_mul(tmp, a[n + k * size_n], b[k + m * size_k], kRoundingMode); + mpfr_t &_c = c[n + m * size_n]; mpfr_add(_c, _c, tmp, kRoundingMode); } } diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp index b3312d7..c38f300 100644 --- a/host/TestProgram.cpp +++ b/host/TestProgram.cpp @@ -114,8 +114,8 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m, // Verify results for (int n = 0; n < size_n; ++n) { for (int m = 0; m < size_m; ++m) { - const PackedFloat res = c_host[n * size_m + m]; - const PackedFloat ref(c_mpfr[n * size_m + m]); + const PackedFloat res = c_host[n + m * size_n]; + const PackedFloat ref(c_mpfr[n + m * size_n]); if (ref != res) { std::cerr << "Verification failed at (" << n << ", " << m << "):\n\t" << res << "\n\t" << ref << "\n"; return false; @@ -127,17 +127,17 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m, // Clean up for (int n = 0; n < size_n; ++n) { for (int k = 0; k < size_k; ++k) { - mpfr_clear(a_mpfr[n * size_k + k]); + mpfr_clear(a_mpfr[n + k * size_n]); } } for (int k = 0; k < size_k; ++k) { for (int m = 0; m < size_m; ++m) { - mpfr_clear(b_mpfr[k * size_m + m]); + mpfr_clear(b_mpfr[k + m * size_k]); } } for (int n = 0; n < size_n; ++n) { for (int m = 0; m < size_m; ++m) { - mpfr_clear(c_mpfr[n * size_m + m]); + mpfr_clear(c_mpfr[n + m * size_n]); } } diff --git a/include/Config.h.in b/include/Config.h.in index e9e0ab1..6b7a358 100644 --- a/include/Config.h.in +++ b/include/Config.h.in @@ -6,4 +6,7 @@ constexpr int kMultBaseBits = ${APFP_MULT_BASE_BITS}; constexpr int kTileSizeN = 
${APFP_TILE_SIZE_N}; constexpr int kTileSizeM = ${APFP_TILE_SIZE_M}; constexpr auto kBuildDir = "${CMAKE_BINARY_DIR}"; +constexpr auto kInstallPrefix = "${CMAKE_INSTALL_PREFIX}"; static_assert(kBits % 8 == 0, "Number of bits must be byte-aligned."); + +#define APFP_${APFP_INTERFACE_TYPE}_INTERFACE_TYPE diff --git a/include/PackedFloat.h b/include/PackedFloat.h index 0673fdf..d3e7290 100644 --- a/include/PackedFloat.h +++ b/include/PackedFloat.h @@ -150,7 +150,7 @@ class PackedFloat { return *this; } - inline void ToGmp(mpf_ptr num) { + inline void ToGmp(mpf_ptr num) const { const size_t gmp_limbs = (mpf_get_prec(num) + 8 * sizeof(mp_limb_t) - 1) / (8 * sizeof(mp_limb_t)); constexpr size_t kNumLimbs = kMantissaBytes / sizeof(Limb); // GMP does not allow graceful rounding, so we cannot handle having insufficient bits in the target GMP number diff --git a/interface/Apfp.cpp b/interface/Apfp.cpp index 9c9cd07..e88585e 100644 --- a/interface/Apfp.cpp +++ b/interface/Apfp.cpp @@ -2,16 +2,59 @@ #include +#include +#include #include +#include #include "Config.h" -Apfp::Apfp() { - program_.emplace(context_.MakeProgram(kernel_path_)); +namespace apfp { + +Context::Context() { + auto kernel_path = FindKernel(); + program_.emplace(context_.MakeProgram(kernel_path)); lines_per_number_ = kLinesPerNumber; } -DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { +std::string Context::FindKernel() { + { // Specify a path to the APFP kernel manually + char* apfp_kernel_env_var = std::getenv("APFP_KERNEL"); + if (apfp_kernel_env_var != nullptr) { + auto kernel_override_path = std::filesystem::path(apfp_kernel_env_var); + + if (!std::filesystem::exists(kernel_override_path)) { + throw KernelNotFoundException( + "APFP kernel path specified with APFP_KERNEL environment variable does not exist"); + } + return kernel_override_path.string(); + } + } + + char* apfp_use_simulation_env_var = std::getenv("APFP_USE_SIMULATION"); + auto apfp_use_simulation = + apfp_use_simulation_env_var != nullptr && !std::string(apfp_use_simulation_env_var).empty(); + auto kernel_name = std::filesystem::path(apfp_use_simulation ? "MatrixMultiplication_hw_emu.xclbin" + : "MatrixMultiplication_hw.xclbin"); + + { + std::vector search_paths; + search_paths.push_back(std::filesystem::current_path()); + search_paths.push_back(std::filesystem::path(kInstallPrefix)); + + // Search + for (auto candidate_dir : search_paths) { + auto candidate_kernel_path = candidate_dir / kernel_name; + if (std::filesystem::exists(candidate_kernel_path)) { + return candidate_kernel_path.string(); + } + } + } + + throw KernelNotFoundException("Unable to find FPGA kernel"); +} + +DeviceMatrix Context::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { // This seems like poor encapsulation, is there a better way? 
DeviceMatrix matrix; @@ -21,13 +64,13 @@ DeviceMatrix Apfp::AllocateDeviceMatrix(std::size_t rows, std::size_t cols) { return matrix; } -DeviceMatrix Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b) { +DeviceMatrix Context::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b) { auto result = AllocateDeviceMatrix(a.rows(), b.cols()); MatrixMultiplication(a, b, &result); return result; } -void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result) { +void Context::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result) { if (a.cols() != b.rows() || result->rows() != a.rows() || result->cols() != b.cols()) { throw std::logic_error("Matrix dimension mismatch"); } @@ -37,15 +80,20 @@ void Apfp::MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, De kernel.ExecuteTask(); } -void Apfp::TransposeInPlace(DeviceMatrix*) { - throw std::exception(); +void Context::MatrixAddition(const DeviceMatrix&, const DeviceMatrix&, DeviceMatrix*) { + throw UnimplementedException(); } -DeviceMatrix Apfp::Transpose(const DeviceMatrix&) { - throw std::exception(); +void Context::TransposeInPlace(DeviceMatrix*) { + throw UnimplementedException(); } -void DeviceMatrix::TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_size) { +DeviceMatrix Context::Transpose(const DeviceMatrix&) { + throw UnimplementedException(); +} + +template +void DeviceMatrix::TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { if (rows() * cols() > buffer_size) { throw std::runtime_error("Source host buffer size smaller than destination device matrix size"); } @@ -54,17 +102,53 @@ void DeviceMatrix::TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_ std::vector host_buffer; host_buffer.resize(cols() * rows()); - std::transform(buffer_ptr, buffer_ptr + host_buffer.size(), host_buffer.begin(), - [](const mpf_t& a) { return PackedFloat(a); }); + for (std::size_t i = 0; i < host_buffer.size(); ++i) { + host_buffer[i] = PackedFloat(buffer_ptr_func(i)); + } buffer_.CopyFromHost(0, host_buffer.size() * kLinesPerNumber, reinterpret_cast(host_buffer.data())); } -void DeviceMatrix::TransferToHost(mpf_t* buffer_ptr, std::size_t buffer_size) { - if (rows() * cols() >= buffer_size) { +void DeviceMatrix::TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr + i; }, buffer_size); +} + +void DeviceMatrix::TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size) { + TransferToDeviceImpl([&](std::size_t i) { return buffer_ptr[i].get(); }, buffer_size); +} + +void PackedFloatToInterfaceType(const PackedFloat& packed, mpfr_ptr dest) { + packed.ToMpfr(dest); +} + +void PackedFloatToInterfaceType(const PackedFloat& packed, mpf_ptr dest) { + packed.ToGmp(dest); +} + +template +void DeviceMatrix::TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size) { + if (rows() * cols() > buffer_size) { throw std::runtime_error("Destination host buffer size smaller than source device matrix size"); } - buffer_.CopyToHost(0, kLinesPerNumber * rows() * cols(), reinterpret_cast(buffer_ptr)); + std::vector host_buffer; + host_buffer.resize(cols() * rows()); + + buffer_.CopyToHost(0, kLinesPerNumber * host_buffer.size(), reinterpret_cast(host_buffer.data())); + + interface::Wrapper scratch; + for (std::size_t i = 0; i < host_buffer.size(); ++i) { + 
PackedFloatToInterfaceType(host_buffer[i], buffer_ptr_func(i)); + } +} + +void DeviceMatrix::TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr + i; }, buffer_size); } + +void DeviceMatrix::TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size) { + TransferToHostImpl([&](std::size_t i) -> interface::Ptr { return buffer_ptr[i].get(); }, buffer_size); +} + +} // namespace apfp \ No newline at end of file diff --git a/interface/Apfp.h b/interface/Apfp.h index f17d39e..6be5555 100644 --- a/interface/Apfp.h +++ b/interface/Apfp.h @@ -2,23 +2,28 @@ #include #include +#include #include +#include "ApfpInterfaceType.h" #include "MatrixMultiplication.h" #include "PackedFloat.h" +namespace apfp { + class DeviceMatrix; /// Object oriented interface for Apfp -class Apfp { +class Context { hlslib::ocl::Context context_; std::optional program_; std::size_t lines_per_number_; - const std::string kernel_path_ = ""; + + static std::string FindKernel(); public: - Apfp(); + Context(); /// Allocate a buffer on the device DeviceMatrix AllocateDeviceMatrix(std::size_t rows, std::size_t cols); @@ -29,6 +34,9 @@ class Apfp { /// Three argument matrix multiply with supplied output buffer void MatrixMultiplication(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result); + /// Three argument matrix addition with supplied output buffer + void MatrixAddition(const DeviceMatrix& a, const DeviceMatrix& b, DeviceMatrix* result); + // Transpose a matrix in place void TransposeInPlace(DeviceMatrix* a); @@ -37,13 +45,13 @@ class Apfp { }; /// Helper class to track matrices on the device -/// We should probably refactor the interface to Apfp to something more controlled? +/// We should probably refactor the interface to Context to something more controlled? 
 class DeviceMatrix {
     std::size_t num_rows_;
     std::size_t num_cols_;
     hlslib::ocl::Buffer buffer_;
-    friend Apfp;
+    friend Context;
 
     DeviceMatrix() = default;
@@ -58,9 +66,45 @@ class DeviceMatrix {
 
     /// Transfer from the host to the device
     /// TODO: Make this take input iterators
-    void TransferToDevice(const mpf_t* buffer_ptr, std::size_t buffer_size);
+    void TransferToDevice(interface::ConstPtr buffer_ptr, std::size_t buffer_size);
+    void TransferToDevice(const interface::Wrapper* buffer_ptr, std::size_t buffer_size);
 
     /// Transfer from the device to the host
     /// TODO: Make this take output iterators
-    void TransferToHost(mpf_t* buffer_ptr, std::size_t buffer_size);
+    void TransferToHost(interface::Ptr buffer_ptr, std::size_t buffer_size);
+    void TransferToHost(interface::Wrapper* buffer_ptr, std::size_t buffer_size);
+
+   private:
+    template <typename ptr_function_type>
+    void TransferToDeviceImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size);
+
+    template <typename ptr_function_type>
+    void TransferToHostImpl(ptr_function_type buffer_ptr_func, std::size_t buffer_size);
+};
+
+// === Custom exception types ===
+struct ApfpException : public std::exception {
+    std::string e;
+
+    ApfpException() {
+        e = "";
+    }
+
+    ApfpException(const std::string& what_arg) {
+        e = what_arg;
+    }
+
+    virtual const char* what() const noexcept {
+        return e.c_str();
+    }
 };
+
+struct KernelNotFoundException : public ApfpException {
+    using ApfpException::ApfpException;
+};
+
+struct UnimplementedException : public ApfpException {
+    using ApfpException::ApfpException;
+};
+
+} // namespace apfp
\ No newline at end of file
diff --git a/interface/ApfpBlas.cpp b/interface/ApfpBlas.cpp
new file mode 100644
index 0000000..0e757d5
--- /dev/null
+++ b/interface/ApfpBlas.cpp
@@ -0,0 +1,276 @@
+#include "ApfpBlas.h"
+
+#include
+#include
+
+#include "Apfp.h"
+#include "Config.h"
+
+namespace apfp {
+
+static std::optional<Context> apfp;
+static std::string last_error_message;
+
+int Init(unsigned long precision) {
+    try {
+        if (precision > kBits) {
+            // Requested bit width too large
+            last_error_message = "Requested bitwidth too large";
+            return static_cast<int>(BlasError::bitwidth);
+        }
+        apfp.emplace();
+        return static_cast<int>(BlasError::success);
+
+    } catch (const KernelNotFoundException& e) {
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::kernel_not_found);
+
+    } catch (const std::exception& e) {
+        // Unknown exception
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::unknown);
+    }
+}
+
+int Finalize() {
+    apfp.reset();
+    return static_cast<int>(BlasError::success);
+}
+
+bool IsInitialized() {
+    return apfp.has_value();
+}
+
+const char* ErrorDescription() {
+    return last_error_message.c_str();
+}
+
+BlasError InterpretError(int a) {
+    return a < 0 ? BlasError::argument_error : static_cast<BlasError>(a);
+}
+
+/// Copy the upper or lower triangle from an NxN matrix A to a full size buffer
+template <typename ptr_function_type>
+void CopyFromMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA,
+                        interface::Wrapper* buffer) {
+    auto dest_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < N; ++j) {
+        for (unsigned long i = 0; i <= j; ++i) {
+            auto source = uplo == BlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA);
+            interface::Set(buffer[i + j * dest_lda].get(), source);
+            interface::Set(buffer[j + i * dest_lda].get(), source);
+        }
+    }
+}
+
+/// Copy from a full size buffer to the upper or lower triangle of an NxN matrix A
+template <typename ptr_function_type>
+void CopyToMatrixUplo(BlasUplo uplo, unsigned long N, ptr_function_type A, unsigned long LDA,
+                      interface::Wrapper* buffer) {
+    auto source_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < N; ++j) {
+        for (unsigned long i = 0; i <= j; ++i) {
+            auto dest = uplo == BlasUplo::lower ? A(i + j * LDA) : A(j + i * LDA);
+            interface::Set(dest, buffer[i + j * source_lda].get());
+        }
+    }
+}
+
+/// Copy from an NxK matrix A to a full size buffer
+template <typename ptr_function_type>
+void CopyFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA,
+                    interface::Wrapper* buffer) {
+    auto dest_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < K; ++j) {
+        for (unsigned long i = 0; i < N; ++i) {
+            interface::Set(buffer[i + j * dest_lda].get(), A(i + j * LDA));
+        }
+    }
+}
+
+/// Copy the transpose of a NxK matrix A to a full size buffer
+template <typename ptr_function_type>
+void CopyTransposeFromMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA,
+                             interface::Wrapper* buffer) {
+    auto dest_lda = K;
+    // Col major layout
+    for (unsigned long j = 0; j < K; ++j) {
+        for (unsigned long i = 0; i < N; ++i) {
+            interface::Set(buffer[i * dest_lda + j].get(), A(i + j * LDA));
+        }
+    }
+}
+
+/// Copy to an NxK matrix A from a full size buffer
+template <typename ptr_function_type>
+void CopyToMatrix(unsigned long N, unsigned long K, ptr_function_type A, unsigned long LDA,
+                  interface::Wrapper* buffer) {
+    auto source_lda = N;
+    // Col major layout
+    for (unsigned long j = 0; j < K; ++j) {
+        for (unsigned long i = 0; i < N; ++i) {
+            interface::Set(A(i + j * LDA), buffer[i + j * source_lda].get());
+        }
+    }
+}
+
+/// Do all the intermediate work to get a device matrix out of a pointer function
+/// TODO: we can make this handle the transpose argument and LDA check (raise exception or option type)
+template <typename ptr_function_type>
+DeviceMatrix MakeDeviceMatrix(unsigned long N, unsigned long M, ptr_function_type A, unsigned long LDA) {
+    std::vector<interface::Wrapper> host_a;
+    host_a.resize(N * M);
+    CopyFromMatrix(N, M, A, LDA, host_a.data());
+    auto device_a = apfp->AllocateDeviceMatrix(N, M);
+    device_a.TransferToDevice(host_a.data(), host_a.size());
+    return device_a;
+}
+
+template <typename ptr_function_type_a, typename ptr_function_type_c>
+int SyrkImpl(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ptr_function_type_a A, unsigned long LDA,
+             ptr_function_type_c C, unsigned long LDC) {
+    try {
+        // ==== library input validation stuff ====
+        if (!IsInitialized()) {
+            return static_cast<int>(BlasError::uninitialized);
+        }
+
+        // A is NxK if 'N', KxN if 'T'
+        // C is always NxN
+        // N mode
+        // A A^T + C
+        // T mode
+        // A^T A + C
+        bool use_transpose = trans == BlasTrans::transpose;
+
+        unsigned long A_rows = use_transpose ? K : N;
+        unsigned long A_cols = use_transpose ? N : K;
+
+        if (LDA < (use_transpose ? K : N)) {
+            return -6;
+        }
+        if (LDC < N) {
+            return -8;
+        }
+
+        // Empty matrix no-op
+        if (N == 0) {
+            return static_cast<int>(BlasError::success);
+        }
+        if (K == 0) {
+            return static_cast<int>(BlasError::success);
+        }
+
+        // ==== setup ====
+        std::vector<interface::Wrapper> host_a, host_a_transpose, host_c;
+        host_a.resize(N * K);
+        CopyFromMatrix(A_rows, A_cols, A, LDA, host_a.data());
+        auto device_a = apfp->AllocateDeviceMatrix(A_rows, A_cols);
+        device_a.TransferToDevice(host_a.data(), host_a.size());
+
+        host_a_transpose.resize(K * N);
+        CopyTransposeFromMatrix(A_rows, A_cols, A, LDA, host_a_transpose.data());
+        auto device_a_transpose = apfp->AllocateDeviceMatrix(A_cols, A_rows);
+        device_a_transpose.TransferToDevice(host_a_transpose.data(), host_a_transpose.size());
+
+        host_c.resize(N * N);
+        CopyFromMatrixUplo(uplo, N, C, LDC, host_c.data());
+        auto device_c = apfp->AllocateDeviceMatrix(N, N);
+        device_c.TransferToDevice(host_c.data(), host_c.size());
+
+        // ==== compute and teardown ====
+        auto mul_result = apfp->AllocateDeviceMatrix(N, N);
+        if (use_transpose) {
+            apfp->MatrixMultiplication(device_a_transpose, device_a, &device_c);
+        } else {
+            apfp->MatrixMultiplication(device_a, device_a_transpose, &device_c);
+        }
+
+        device_c.TransferToHost(host_c.data(), host_c.size());
+        CopyToMatrixUplo(uplo, N, C, LDC, host_c.data());
+    } catch (const std::exception& e) {
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::unknown);
+    }
+
+    return static_cast<int>(BlasError::success);
+}
+
+/// See netlib's documentation on Syrk for usage. Alpha and beta unsupported
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA,
+         interface::Ptr C, unsigned long LDC) {
+    auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; };
+    auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; };
+    return SyrkImpl(uplo, trans, N, K, a_ptr_function, LDA, c_ptr_function, LDC);
+}
+
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA,
+         IndexFunction C, unsigned long LDC) {
+    return SyrkImpl(uplo, trans, N, K, A, LDA, C, LDC);
+}
+
+template <typename ptr_function_type_a, typename ptr_function_type_b, typename ptr_function_type_c>
+int GemmImpl(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K,
+             ptr_function_type_a A, unsigned long LDA, ptr_function_type_b B, unsigned long LDB, ptr_function_type_c C,
+             unsigned long LDC) {
+    try {
+        // ==== library input validation stuff ====
+        if (!IsInitialized()) {
+            return static_cast<int>(BlasError::uninitialized);
+        }
+
+        // Implement the transposed versions later
+        if (trans_a != BlasTrans::normal || trans_b != BlasTrans::normal) {
+            return static_cast<int>(BlasError::unimplemented);
+        }
+
+        // Empty matrix
+        if (N == 0 || M == 0 || K == 0) {
+            return static_cast<int>(BlasError::success);
+        }
+
+        // Validate leading dimensions are sane
+        if (LDA < M) {
+            return -7;
+        }
+        if (LDB < K) {
+            return -9;
+        }
+
+        // ==== setup ====
+        auto device_a = MakeDeviceMatrix(M, K, A, LDA);
+        auto device_b = MakeDeviceMatrix(K, N, B, LDB);
+        auto device_c = MakeDeviceMatrix(M, N, C, LDC);
+
+        // ==== compute and teardown ====
+        apfp->MatrixMultiplication(device_a, device_b, &device_c);
+        std::vector<interface::Wrapper> host_c;
+        host_c.resize(M * N);
+        device_c.TransferToHost(host_c.data(), host_c.size());
+        CopyToMatrix(M, N, C, LDC, host_c.data());
+
+    } catch (const std::exception& e) {
+        last_error_message = e.what();
+        return static_cast<int>(BlasError::unknown);
+    }
+
+    return static_cast<int>(BlasError::success);
+}
+/// See netlib's documentation on Gemm for usage. Alpha and beta unsupported
+int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, interface::ConstPtr A,
+         unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC) {
+    auto a_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return A + i; };
+    auto b_ptr_function = [&](unsigned long i) -> interface::ConstPtr { return B + i; };
+    auto c_ptr_function = [&](unsigned long i) -> interface::Ptr { return C + i; };
+    return GemmImpl(trans_a, trans_b, M, N, K, a_ptr_function, LDA, b_ptr_function, LDB, c_ptr_function, LDC);
+}
+
+int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ConstIndexFunction A,
+         unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC) {
+    return GemmImpl(trans_a, trans_b, M, N, K, A, LDA, B, LDB, C, LDC);
+}
+
+} // namespace apfp
\ No newline at end of file
diff --git a/interface/ApfpBlas.h b/interface/ApfpBlas.h
new file mode 100644
index 0000000..89d7593
--- /dev/null
+++ b/interface/ApfpBlas.h
@@ -0,0 +1,53 @@
+#pragma once
+#include
+
+#include
+
+#include "ApfpInterfaceType.h"
+
+namespace apfp {
+
+enum class BlasError : int {
+    success = 0,
+    unknown = 1,
+    unimplemented = 2,
+    bitwidth = 3,
+    uninitialized = 4,
+    kernel_not_found = 5,
+    argument_error = 6,
+};
+
+enum class BlasUplo : char { upper = 'U', lower = 'L' };
+
+enum class BlasTrans : char {
+    normal = 'N',
+    transpose = 'T',
+};
+
+using IndexFunction = std::function<interface::Ptr(unsigned long)>;
+using ConstIndexFunction = std::function<interface::ConstPtr(unsigned long)>;
+/// Null terminated string describing the most recent library error if available
+/// Pointer is only guaranteed to live until the next library call
+const char* ErrorDescription();
+
+/// Convert a return code to a BlasError type
+/// Negative return codes are converted to BlasError::argument_error
+BlasError InterpretError(int a);
+
+int Init(unsigned long precision);
+
+int Finalize();
+
+/// See netlib's documentation on Syrk for usage. Alpha and beta unsupported
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, interface::ConstPtr A, unsigned long LDA,
+         interface::Ptr C, unsigned long LDC);
+int Syrk(BlasUplo uplo, BlasTrans trans, unsigned long N, unsigned long K, ConstIndexFunction A, unsigned long LDA,
+         IndexFunction C, unsigned long LDC);
+
+/// See netlib's documentation on Gemm for usage. 
Alpha and beta unsupported +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, interface::ConstPtr A, + unsigned long LDA, interface::Ptr B, unsigned long LDB, interface::Ptr C, unsigned long LDC); +int Gemm(BlasTrans trans_a, BlasTrans trans_b, unsigned long M, unsigned long N, unsigned long K, ConstIndexFunction A, + unsigned long LDA, IndexFunction B, unsigned long LDB, IndexFunction C, unsigned long LDC); + +} // namespace apfp \ No newline at end of file diff --git a/interface/ApfpInterfaceType.cpp b/interface/ApfpInterfaceType.cpp new file mode 100644 index 0000000..6862444 --- /dev/null +++ b/interface/ApfpInterfaceType.cpp @@ -0,0 +1,94 @@ +#include "ApfpInterfaceType.h" +#include "DeviceTypes.h" + +namespace apfp::interface { + +void Init(Ptr value) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_init(value); +#else + mpfr_init(value); +#endif +} + +void Init2(Ptr value, unsigned long precision) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_init2(value, precision); +#else + mpfr_init(value); + mpfr_set_prec(value, precision); +#endif +} + +void Clear(Ptr value) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_clear(value); +#else + mpfr_clear(value); +#endif +} + +void Swap(Ptr a, Ptr b) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_swap(a, b); +#else + mpfr_swap(a, b); +#endif +} + +void Set(Ptr dest, ConstPtr source) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set(dest, source); +#else + mpfr_set(dest, source, kRoundingMode); +#endif +} + +void Set(Ptr dest, long int source) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_set_ui(dest, source); +#else + mpfr_set_si(dest, source, kRoundingMode); +#endif +} + +void Add(Ptr dest, ConstPtr a, ConstPtr b) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_add(dest, a, b); +#else + mpfr_add(dest, a, b, kRoundingMode); +#endif +} + +void Mul(Ptr dest, ConstPtr a, ConstPtr b) { +#ifdef APFP_GMP_INTERFACE_TYPE + mpf_mul(dest, a, b); +#else + mpfr_mul(dest, a, b, kRoundingMode); +#endif +} + +Wrapper::~Wrapper() { + Clear(data_); +} + +Wrapper::Wrapper() { + Init(data_); +} + +Wrapper::Wrapper(unsigned long precision) { + Init2(data_, precision); +} + +Wrapper::Wrapper(Wrapper&& other) { + Swap(data_, other.data_); + Clear(other.data_); +} + +Wrapper& Wrapper::operator=(Wrapper&& other) { + Swap(data_, other.data_); + Clear(other.data_); + return *this; +} + +} // namespace apfp::interface \ No newline at end of file diff --git a/interface/ApfpInterfaceType.h b/interface/ApfpInterfaceType.h new file mode 100644 index 0000000..57ed2bf --- /dev/null +++ b/interface/ApfpInterfaceType.h @@ -0,0 +1,70 @@ +#pragma once +#include +#include + +#include "Config.h" + +/* This header abstracts away the choice of MPFR or GMP in the interface + * It defines four types: Value, Ptr, ConstPtr, Wrapper + * The first three directly correspond to MPFR/GMP types + * The last one is a wrapper that manages the memory footprint with RAII + */ +namespace apfp::interface { + +#ifdef APFP_GMP_INTERFACE_TYPE // Interface with GMP types +using Value = mpf_t; +using Ptr = mpf_ptr; +using ConstPtr = mpf_srcptr; +#else +#include +using Value = mpfr_t; +using Ptr = mpfr_ptr; +using ConstPtr = mpfr_srcptr; +#endif + +void Init(Ptr value); + +void Init2(Ptr value, unsigned long precision); + +void Clear(Ptr value); + +void Swap(Ptr a, Ptr b); + +void Set(Ptr dest, ConstPtr source); + +void Set(Ptr dest, long int source); + +void Add(Ptr dest, ConstPtr a, ConstPtr b); + +void Mul(Ptr dest, ConstPtr a, ConstPtr b); + +/// Smart pointer-like wrapper class for GMP/MPFR types +class 
Wrapper { + Value data_; + + public: + ~Wrapper(); + + Wrapper(); + + Wrapper(unsigned long precision); + + Wrapper(Wrapper&&); + + Wrapper(Wrapper&) = delete; + + Wrapper& operator=(const Wrapper&) = delete; + + Wrapper& operator=(Wrapper&&); + + // This decays to the pointer type + Ptr get() { + return data_; + } + + ConstPtr get() const { + return data_; + } +}; + +} // namespace apfp::interface \ No newline at end of file diff --git a/scripts/run_simulation.sh b/scripts/run_simulation.sh index cee9fbd..f2725f4 100755 --- a/scripts/run_simulation.sh +++ b/scripts/run_simulation.sh @@ -1,13 +1,65 @@ #!/bin/bash sizes=(1 3 4 7 9 15 16 17 31 33 41) +small_sizes=(1 3 4 7 9 15 16 17) +large_sizes=(31 33 41) batch_size=12 -for n in "${sizes[@]}" + +for n in "${small_sizes[@]}" +do + for m in "${small_sizes[@]}" + do + for k in "${small_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + + +for n in "${small_sizes[@]}" +do + for m in "${large_sizes[@]}" + do + for k in "${large_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + +for n in "${large_sizes[@]}" +do + for m in "${small_sizes[@]}" + do + for k in "${large_sizes[@]}" + do + echo $n $m $k 1>&2 + (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) & + + if [[ $(jobs -r -p | wc -l) -ge $batch_size ]]; then + wait -n + fi + done + done +done + +for n in "${large_sizes[@]}" do - for m in "${sizes[@]}" + for m in "${large_sizes[@]}" do - for k in "${sizes[@]}" + for k in "${small_sizes[@]}" do echo $n $m $k 1>&2 (./TestSimulation $n $m $k | tee sim_output.${n}.${m}.${k}.txt) &