diff --git a/.github/workflows/build-and-test.sh b/.github/workflows/build-and-test.sh
old mode 100755
new mode 100644
index 69fd8a71a..f8939c8f0
--- a/.github/workflows/build-and-test.sh
+++ b/.github/workflows/build-and-test.sh
@@ -34,7 +34,7 @@ if [ "$PLUGIN_NAME" == "xdc" ] || [ "$PLUGIN_NAME" == "sdc" ]; then
 fi 
 
 export CXXFLAGS=-Werror
-make UHDM_INSTALL_DIR=`pwd`/env/conda/envs/yosys-plugins/ ${PLUGIN_NAME}.so -j`nproc`
+make UHDM_INSTALL_DIR=`pwd`/env/conda/envs/yosys-plugins/ VTR_INSTALL_DIR=`pwd`/env/conda/envs/yosys-plugins ${PLUGIN_NAME}.so -j`nproc`
 unset CXXFLAGS
 
 end_section
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 95737de5f..80aa350fc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,6 +38,7 @@ jobs:
           - systemverilog
           - uhdm
           - dsp-ff
+          - parmys
 
     steps:
 
diff --git a/.github/workflows/licensing.yml b/.github/workflows/licensing.yml
index 60e54f613..6d5302ef5 100644
--- a/.github/workflows/licensing.yml
+++ b/.github/workflows/licensing.yml
@@ -33,5 +33,12 @@ jobs:
           ./design_introspection-plugin/tests/selection_to_tcl_list/selection_to_tcl_list.v
           ./third_party/minilitex_ddr_arty/minilitex_ddr_arty.v
           ./third_party/VexRiscv_Lite/VexRiscv_Lite.v
+          ./third_party/vtr_flow/verilog/eltwise_layer.v
+          ./third_party/vtr_flow/verilog/raygentop.v
+          ./third_party/vtr_flow/verilog/hard_block_include.v
+          ./third_party/vtr_flow/arch/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
+          ./third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml
+          ./third_party/vtr_flow/primitives.v
         third_party: |
           ./third_party/googletest/
+          ./third_party/mips32r1_core/
diff --git a/.gitmodules b/.gitmodules
index 918b2bfde..9c7ba5744 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "third_party/make-env"]
 	path = third_party/make-env
 	url = https://github.com/SymbiFlow/make-env.git
+[submodule "third_party/mips32r1_core"]
+	path = third_party/mips32r1_core
+	url = https://github.com/grantae/mips32r1_core.git
diff --git a/Makefile b/Makefile
index 8fc815dd1..e789ab134 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@
 # TODO: pass as -D to gcc so that modules can provide e.g. --version flags.
 PLUGIN_VERSION = 1.20230808
 
-PLUGIN_LIST := fasm xdc params sdc ql-iob design_introspection integrateinv ql-qlf systemverilog uhdm dsp-ff
+PLUGIN_LIST := fasm xdc params sdc ql-iob design_introspection integrateinv ql-qlf systemverilog uhdm dsp-ff parmys
 PLUGINS := $(foreach plugin,$(PLUGIN_LIST),$(plugin).so)
 PLUGINS_INSTALL := $(foreach plugin,$(PLUGIN_LIST),install_$(plugin))
 PLUGINS_CLEAN := $(foreach plugin,$(PLUGIN_LIST),clean_$(plugin))
diff --git a/environment.yml b/environment.yml
index a4cf46c24..22615b4c1 100644
--- a/environment.yml
+++ b/environment.yml
@@ -22,3 +22,4 @@ dependencies:
   - litex-hub::yosys=0.17_7_g990c9b8e1=20220512_085338_py37
   - litex-hub::surelog
   - litex-hub::iverilog
+  - cas-atlantic::vtr-libs
\ No newline at end of file
diff --git a/parmys-plugin/Makefile b/parmys-plugin/Makefile
new file mode 100644
index 000000000..dbb3eb11e
--- /dev/null
+++ b/parmys-plugin/Makefile
@@ -0,0 +1,93 @@
+# Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+PLUGIN_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
+# PLUGIN_DIR (above) is the absolute directory of this Makefile. NAME and SOURCES are presumably read by Makefile_plugin.common - confirm there.
+NAME = parmys
+SOURCES = parmys.cc \
+		  parmys_arch.cc \
+		  parmys_update.cc \
+		  parmys_utils.cc \
+		  parmys_resolve.cc \
+		  core/adder.cc \
+		  utils/enum_str.cc \
+		  mapping/mixing_optimization.cc \
+		  utils/read_xml_config_file.cc \
+		  utils/odin_error.cc \
+		  utils/odin_util.cc \
+		  netlist/netlist_statistic.cc \
+		  netlist/netlist_utils.cc \
+		  netlist/netlist_cleanup.cc \
+		  netlist/netlist_visualizer.cc \
+		  netlist/node_utils.cc \
+		  core/multiplier.cc \
+		  core/subtractor.cc \
+		  mapping/hard_soft_logic_mixer.cc \
+		  mapping/odin_ii.cc \
+		  utils/string_cache.cc \
+		  mapping/partial_map.cc \
+		  core/hard_block.cc \
+		  core/block_memory.cc \
+		  core/memory.cc \
+		  utils/hash_table.cc \
+		  utils/ast_util.cc
+
+VTR_INSTALL_DIR ?= /usr/local
+# VTR_INSTALL_DIR is only defaulted when not already set; the -I and -L paths below are all taken from underneath it.
+include ../Makefile_plugin.common
+# The flags below are appended (+=) after the include, so they extend whatever the common makefile has set.
+CXXFLAGS += -std=c++14 -Wall -W -Wextra \
+            -Wno-deprecated-declarations \
+            -Wno-unused-parameter \
+            -I. \
+			-Icore \
+			-Imapping \
+			-Inetlist \
+			-Iutils \
+			-I${VTR_INSTALL_DIR}/include/libarchfpga \
+			-I${VTR_INSTALL_DIR}/include/liblog \
+			-I${VTR_INSTALL_DIR}/include/libpugiutil \
+			-I${VTR_INSTALL_DIR}/include/libpugixml \
+			-I${VTR_INSTALL_DIR}/include/librtlnumber \
+			-I${VTR_INSTALL_DIR}/include/libvtrutil
+# Library search paths: both the lib/ and the bin/ directory of the VTR install.
+LDFLAGS += -L${VTR_INSTALL_DIR}/lib \
+		   -L${VTR_INSTALL_DIR}/bin
+# Libraries linked into the plugin: the VTR libraries named below plus pthread.
+LDLIBS += -larchfpga \
+		  -lvtrutil \
+		  -llog \
+		  -lpugixml \
+		  -lpugiutil \
+		  -lrtlnumber \
+		  -lpthread
+
+TECHLIBS_DIR = techlibs
+VERILOG_MODULES = adff2dff.v \
+				  adffe2dff.v \
+				  aldff2dff.v \
+				  aldffe2dff.v \
+				  vtr_primitives.v
+# install_modules copies each file found in TECHLIBS_DIR into YOSYS_DATA_DIR/parmys/.
+# Both helper targets are commands, not files, so they are declared phony.
+.PHONY: install_modules clean_modules
+install_modules:
+	$(foreach f, $(wildcard $(TECHLIBS_DIR)/*), install -D $(f) $(YOSYS_DATA_DIR)/parmys/$(notdir $(f));)
+install: install_modules
+# NOTE(review): clean_modules removes ./third_party relative to the working directory - confirm this is intended.
+clean_modules:
+	rm -rf ./third_party
+clean: clean_modules
\ No newline at end of file
diff --git a/parmys-plugin/core/adder.cc b/parmys-plugin/core/adder.cc
new file mode 100644
index 000000000..ff920398a
--- /dev/null
+++ b/parmys-plugin/core/adder.cc
@@ -0,0 +1,1411 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "adder.h"
+#include "multiplier.h"
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+#include "subtractor.h"
+#include <string.h>
+
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+#include "parmys_utils.h"
+
+using vtr::t_linked_vptr;
+
+USING_YOSYS_NAMESPACE
+
+t_model *hard_adders = NULL; // the architecture model named "adder"; set by find_hard_adders(), stays NULL when none exists
+t_linked_vptr *add_list = NULL;
+t_linked_vptr *processed_adder_list = NULL; // split_adder() inserts every adder node it creates
+t_linked_vptr *chain_list = NULL; // split_adder() inserts one chain_information_t per split adder
+int total = 0;
+int *adder = NULL; // zero-initialised array allocated in init_add_distribution()
+int min_add = 0; // threshold split_adder() compares the leftover a/b bit count against
+int min_threshold_adder = 0; // copied from configuration.min_threshold_adder by find_hard_adders()
+// Forward declarations: init_split_adder() is defined below; cleanup_add_old_node() is presumably defined later in this file.
+void init_split_adder(nnode_t *node, nnode_t *ptr, int a, int sizea, int b, int sizeb, int cin, int cout, int index, int flag, netlist_t *netlist);
+static void cleanup_add_old_node(nnode_t *nodeo, netlist_t *netlist);
+
+/*---------------------------------------------------------------------------
+ * (function: init_add_distribution)
+ *  Allocates the zero-filled `adder` array; requires the hard adder model.
+ *-------------------------------------------------------------------------*/
+void init_add_distribution()
+{
+    oassert(hard_adders != NULL);
+    // one entry per bit of the model's first two input ports combined, plus one
+    const int num_slots = hard_adders->inputs->size + hard_adders->inputs->next->size + 1;
+    adder = (int *)vtr::calloc(num_slots, sizeof(int));
+}
+
+/* These values are collected during the unused logic removal sweep */
+extern long adder_chain_count;
+extern long longest_adder_chain;
+extern long total_adders;
+
+extern double geomean_addsub_length;
+extern double sum_of_addsub_logs;
+// Logs the hard adder statistics held in the extern counters above; prints nothing when no hard adder model was found.
+void report_add_distribution()
+{
+    if (hard_adders == NULL)
+        return;
+
+    log("\nHard adder Distribution\n");
+    log("============================\n");
+    log("\n");
+    log("\nTotal # of chains = %ld\n", adder_chain_count);
+
+    log("\nHard adder chain Details\n");
+    log("============================\n");
+
+    log("\n");
+    log("\nThe Number of Hard Block adders in the Longest Chain: %ld\n", longest_adder_chain);
+
+    log("\n");
+    log("\nThe Total Number of Hard Block adders: %ld\n", total_adders);
+
+    log("\n");
+    log("\nGeometric mean adder/subtractor chain length: %.2f\n", geomean_addsub_length);
+
+    // vtr::free(adder);  (left disabled: the `adder` array is not released by this function)
+}
+
+/*---------------------------------------------------------------------------
+ * (function: find_hard_adders)
+ *
+ * Scans the architecture's model list for the model named "adder". On a
+ * match, hard_adders is left pointing at it and the distribution array is
+ * allocated; if no such model exists hard_adders ends up NULL.
+ *-------------------------------------------------------------------------*/
+void find_hard_adders()
+{
+    // Disable the size in configuration file.(The threshold for the extra bits).
+    // min_add = configuration.min_hard_adder;
+    min_threshold_adder = configuration.min_threshold_adder;
+
+    for (hard_adders = Arch.models; hard_adders != NULL; hard_adders = hard_adders->next) {
+        if (strcmp(hard_adders->name, "adder") != 0)
+            continue;
+
+        init_add_distribution();
+        break;
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * (function: declare_hard_adder)
+ * Records the (a, b, sumout) port widths of `node` in the instance list of
+ * the hard adder model, unless an entry with the same widths already
+ * exists. When the architecture has no hard adder model, a warning is
+ * emitted and nothing is recorded.
+ *-------------------------------------------------------------------------*/
+void declare_hard_adder(nnode_t *node)
+{
+    /* No model means no instance list: warn and return instead of dereferencing NULL. */
+    if (hard_adders == NULL) {
+        warning_message(NETLIST, node->loc, "%s\n", "Instantiating adder where adders do not exist");
+        return;
+    }
+
+    const int width_a = node->input_port_sizes[0];
+    const int width_b = node->input_port_sizes[1];
+    const int width_sumout = node->output_port_sizes[1];
+
+    /* See if this size instance of adder exists? */
+    for (t_adder *it = (t_adder *)hard_adders->instances; it != NULL; it = it->next) {
+        if ((it->size_a == width_a) && (it->size_b == width_b) && (it->size_sumout == width_sumout))
+            return;
+    }
+
+    /* Does not exist - must create an instance */
+    t_adder *tmp = (t_adder *)vtr::malloc(sizeof(t_adder));
+    tmp->next = (t_adder *)hard_adders->instances;
+    hard_adders->instances = tmp;
+    tmp->size_a = width_a;
+    tmp->size_b = width_b;
+    tmp->size_cin = 1;
+    tmp->size_cout = 1;
+    tmp->size_sumout = width_sumout;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: instantiate_hard_adder)
+ *
+ * Registers the node's port widths with the hard adder model through
+ * declare_hard_adder(), names every output pin that has no name yet as
+ * "<node name>[<pin_node_idx>]", and stores `mark` in the node's
+ * traverse_visited field.
+ *
+ *  node: adder node being instantiated; its own name is left untouched
+ *  mark: traversal marker written to node->traverse_visited
+ *  (the netlist argument is unused)
+ *
+ * Each generated name is heap allocated and stored in the pin; it is not freed here.
+ * The previous version also formatted node->name into a scratch buffer
+ * that was freed immediately and never read; that dead allocation and the
+ * size check attached to it have been removed.
+ *-------------------------------------------------------------------------*/
+void instantiate_hard_adder(nnode_t *node, short mark, netlist_t * /*netlist*/)
+{
+    char *new_name;
+    int len;
+
+    /* make sure the model has an instance entry for these port widths */
+    declare_hard_adder(node);
+
+    /* Give names to the output pins */
+    for (int i = 0; i < node->num_output_pins; i++) {
+        if (node->output_pins[i]->name == NULL) {
+            /* 20 spare chars: room for '[', the index printed with %d, ']' and the terminator */
+            len = strlen(node->name) + 20;
+            new_name = (char *)vtr::malloc(len);
+            odin_sprintf(new_name, "%s[%d]", node->name, node->output_pins[i]->pin_node_idx);
+            node->output_pins[i]->name = new_name;
+        }
+    }
+
+    /* stamp the node with the caller's traversal mark */
+    node->traverse_visited = mark;
+    return;
+}
+
+/*----------------------------------------------------------------------------
+ * function: add_the_blackbox_for_adds_yosys()
+ *--------------------------------------------------------------------------*/
+void add_the_blackbox_for_adds_yosys(Yosys::Design *design)
+{
+    // Adds a module named "adder", flagged as a blackbox, to `design`; its per-bit port wires are sized from the declared hard adder instance.
+    int hard_add_inputs, hard_add_outputs;
+    t_adder *adds;
+    t_model_ports *ports;
+    char *pa, *pb, *psumout, *pcin, *pcout;
+
+    /* Check to make sure this target architecture has hard adders */
+    if (hard_adders == NULL)
+        return;
+
+    /* Get the names of the ports for the adder: the model's inputs are read in the order cin, b, a */
+    ports = hard_adders->inputs;
+    pcin = ports->name;
+    ports = ports->next;
+    pb = ports->name;
+    ports = ports->next;
+    pa = ports->name;
+    /* ...and its outputs in the order sumout, cout */
+    ports = hard_adders->outputs;
+    psumout = ports->name;
+    ports = ports->next;
+    pcout = ports->name;
+
+    /* find the adder devices in the tech library */
+    adds = (t_adder *)(hard_adders->instances);
+    if (adds == NULL) /* No adders instantiated */
+        return;
+
+    /* One pass per declared instance. NOTE(review): the module is always named "adder", so a second instance would reach the duplicate-definition log_error below - confirm at most one instance is expected */
+    while (adds != NULL) {
+
+        Yosys::RTLIL::Module *module = nullptr;
+        // per port name: .first = highest bit index seen + 1, .second = true for inputs / false for outputs
+        Yosys::hashlib::dict<Yosys::RTLIL::IdString, std::pair<int, bool>> wideports_cache;
+
+        module = new Yosys::RTLIL::Module;
+        module->name = Yosys::RTLIL::escape_id("adder");
+
+        if (design->module(module->name))
+            Yosys::log_error("Duplicate definition of module %s!\n", Yosys::log_id(module->name));
+        design->add(module);
+
+        /* add the inputs: one wire per bit, a bits first, then b, then cin */
+        hard_add_inputs = adds->size_a + adds->size_b + adds->size_cin;
+        for (int i = 0; i < hard_add_inputs; i++) {
+            std::string w_name;
+            if (i < adds->size_a) {
+                w_name = Yosys::stringf("%s[%d]", pa, i);
+            } else if (i < hard_add_inputs - adds->size_cin && i >= adds->size_a) {
+                w_name = Yosys::stringf("%s[%d]", pb, i - adds->size_a);
+            } else {
+                w_name = Yosys::stringf("%s[%d]", pcin, i - adds->size_a - adds->size_b);
+            }
+
+            Yosys::RTLIL::Wire *wire = to_wire(w_name, module);
+            wire->port_input = true;
+            // wideports_split() presumably yields the base port name and bit index of w_name - TODO confirm
+            std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(w_name);
+            if (!wp.first.empty() && wp.second >= 0) {
+                wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                wideports_cache[wp.first].second = true;
+            }
+        }
+
+        /* add the outputs: cout bits first, then sumout */
+        hard_add_outputs = adds->size_cout + adds->size_sumout;
+        for (int i = 0; i < hard_add_outputs; i++) {
+            std::string w_name;
+            if (i < adds->size_cout) {
+                w_name = Yosys::stringf("%s[%d]", pcout, i);
+            } else {
+                w_name = Yosys::stringf("%s[%d]", psumout, i - adds->size_cout);
+            }
+
+            Yosys::RTLIL::Wire *wire = to_wire(w_name, module);
+            wire->port_output = true;
+
+            std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(w_name);
+            if (!wp.first.empty() && wp.second >= 0) {
+                wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                wideports_cache[wp.first].second = false;
+            }
+        }
+
+        handle_wideports_cache(&wideports_cache, module);
+
+        module->fixup_ports();
+        wideports_cache.clear();
+
+        module->attributes[Yosys::ID::blackbox] = Yosys::RTLIL::Const(1);
+
+        adds = adds->next;
+    }
+}
+
+void define_add_function_yosys(nnode_t *node, Yosys::Module *module, Yosys::Design *design)
+{
+    /* Adds one "adder" cell to `module` and ties every port bit to the wire named after the matching pin of `node`. */
+    oassert(node->input_port_sizes[0] > 0);
+    oassert(node->input_port_sizes[1] > 0);
+    oassert(node->input_port_sizes[2] > 0);
+    oassert(node->output_port_sizes[0] > 0);
+    oassert(node->output_port_sizes[1] > 0);
+
+    const std::string cell_type_name = "adder";
+    const Yosys::IdString celltype = Yosys::RTLIL::escape_id(cell_type_name);
+
+    Yosys::RTLIL::Cell *cell = module->addCell(NEW_ID, celltype);
+
+    /* port name -> (bit index -> signal), filled whenever wideports_split() returns a non-empty name */
+    Yosys::hashlib::dict<Yosys::RTLIL::IdString, Yosys::hashlib::dict<int, Yosys::SigBit>> cell_wideports_cache;
+
+    /* widths of the node's first two input ports and of its first output port */
+    const int width_in0 = node->input_port_sizes[0];
+    const int width_in1 = node->input_port_sizes[1];
+    const int width_out0 = node->output_port_sizes[0];
+
+    /* Write the input pins*/
+    for (int i = 0; i < node->num_input_pins; i++) {
+        oassert(node->input_pins[i]->net->num_driver_pins == 1);
+        npin_t *driver_pin = node->input_pins[i]->net->driver_pins[0];
+
+        /* node port 0 maps to the model's third input, port 1 to its second, port 2 to its first */
+        std::string p;
+        if (i < width_in0)
+            p = Yosys::stringf("%s[%d]", hard_adders->inputs->next->next->name, i);
+        else if (i < width_in0 + width_in1)
+            p = Yosys::stringf("%s[%d]", hard_adders->inputs->next->name, i - width_in0);
+        else
+            p = Yosys::stringf("%s[%d]", hard_adders->inputs->name, i - (width_in0 + width_in1));
+
+        /* the wire takes the driver pin's name, or the driver node's name when the pin is unnamed */
+        std::string q;
+        if (driver_pin->name)
+            q = driver_pin->name;
+        else
+            q = driver_pin->node->name;
+
+        std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(p);
+        if (wp.first.empty())
+            cell->setPort(Yosys::RTLIL::escape_id(p), to_wire(q, module));
+        else
+            cell_wideports_cache[wp.first][wp.second] = to_wire(q, module);
+    }
+
+    /* Write the output pins*/
+    for (int i = 0; i < node->num_output_pins; i++) {
+        /* node output port 0 maps to the model's second output, the remaining pins to its first */
+        std::string p;
+        if (i >= width_out0)
+            p = Yosys::stringf("%s[%d]", hard_adders->outputs->name, i - width_out0);
+        else
+            p = Yosys::stringf("%s[%d]", hard_adders->outputs->next->name, i);
+
+        /* output wires are always named after the node's own output pin */
+        std::string q = node->output_pins[i]->name;
+
+        std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(p);
+        if (wp.first.empty())
+            cell->setPort(Yosys::RTLIL::escape_id(p), to_wire(q, module));
+        else
+            cell_wideports_cache[wp.first][wp.second] = to_wire(q, module);
+    }
+
+    handle_cell_wideports_cache(&cell_wideports_cache, design, module, cell);
+}
+
+/*-----------------------------------------------------------------------
+ * (function: init_split_adder)
+ * TODO the soft logic adders can now be split at the source, we could tap onto that and merge these functions for
+ * simplicity and would also make sure to keep the allocation at one place
+ *	Create a carry chain adder when splitting. Inputs are connected
+ *	to original pins, output pins are set to NULL for later connecting
+ *	flag = 0: all adders are hard logic block; flag = 1: the last adder in the chain is soft logic block
+ *	node/ptr: original adder and the new slice being filled in; a/b: widths of the original a and b ports
+ *	sizea/sizeb: a/b widths of one slice; cin/cout: carry port widths; index: position of this slice in the chain
+ *---------------------------------------------------------------------*/
+void init_split_adder(nnode_t *node, nnode_t *ptr, int a, int sizea, int b, int sizeb, int cin, int cout, int index, int flag, netlist_t *netlist)
+{
+    int flaga = 0, flagb = 0; // per port: 0 = slice fully covered by original pins, 1 = no original pins left, 2 = partly covered
+    int current_sizea, current_sizeb;
+    int aa = 0, bb = 0, num = 0; // aa/bb: how many original a/b pins a partly covered slice still receives
+
+    // if the input of the first cin is generated by a dummy adder added
+    // to the start of the chain, then an offset is needed to compensate
+    // for that in various positions in the code, otherwise the offset is 0
+    const int offset = (configuration.adder_cin_global) ? 0 : 1;
+
+    /* Copy properties from original node */
+    ptr->type = node->type;
+    ptr->bit_width = node->bit_width;
+    ptr->related_ast_node = node->related_ast_node;
+    ptr->traverse_visited = node->traverse_visited;
+    ptr->node_data = NULL;
+
+    /* decide the current size of input a and b */
+    if (flag == 0) {
+        // increase input sizes by one if a dummy adder is
+        // added to feed the first cin in the chain
+        current_sizea = (a + offset) - sizea * index;
+        current_sizeb = (b + offset) - sizeb * index;
+
+        if (current_sizea >= sizea)
+            current_sizea = sizea;
+        else if (current_sizea <= 0) {
+            current_sizea = sizea;
+            flaga = 1;
+        } else {
+            aa = current_sizea;
+            current_sizea = sizea;
+            flaga = 2;
+        }
+
+        if (current_sizeb >= sizeb)
+            current_sizeb = sizeb;
+        else if (current_sizeb <= 0) {
+            current_sizeb = sizeb;
+            flagb = 1;
+        } else {
+            bb = current_sizeb;
+            current_sizeb = sizeb;
+            flagb = 2;
+        }
+    } else {
+        if (sizea != 0)
+            current_sizea = sizea;
+        else
+            current_sizea = 1;
+        if (sizeb != 0)
+            current_sizeb = sizeb;
+        else
+            current_sizeb = 1;
+    }
+
+    /* Set new port sizes and parameters */
+    ptr->num_input_port_sizes = 3;
+    ptr->input_port_sizes = (int *)vtr::malloc(3 * sizeof(int));
+    ptr->input_port_sizes[0] = current_sizea;
+    ptr->input_port_sizes[1] = current_sizeb;
+    ptr->input_port_sizes[2] = cin;
+    ptr->num_output_port_sizes = 2;
+    ptr->output_port_sizes = (int *)vtr::malloc(2 * sizeof(int));
+    ptr->output_port_sizes[0] = cout;
+
+    /* The size of output port sumout equals the maximum of sizea and sizeb  */
+    if (current_sizea > current_sizeb)
+        ptr->output_port_sizes[1] = current_sizea;
+    else
+        ptr->output_port_sizes[1] = current_sizeb;
+
+    /* Set the number of pins and re-locate previous pin entries */
+    ptr->num_input_pins = current_sizea + current_sizeb + cin;
+    ptr->input_pins = (npin_t **)vtr::malloc(sizeof(void *) * (current_sizea + current_sizeb + cin));
+    // if flaga or flagb = 1, the input pins should be empty.
+    if (flaga == 1) {
+        for (int i = 0; i < current_sizea; i++)
+            ptr->input_pins[i] = NULL;
+    } else if (flaga == 2) {
+        if (index == 0) {
+            ptr->input_pins[0] = NULL;
+            if (sizea > 1) {
+                for (int i = 1; i < aa; i++) {
+                    ptr->input_pins[i] = node->input_pins[i + index * sizea - 1];
+                    ptr->input_pins[i]->node = ptr;
+                    ptr->input_pins[i]->pin_node_idx = i;
+                }
+                for (int i = 0; i < (sizea - aa); i++)
+                    ptr->input_pins[i + aa] = NULL;
+            }
+        } else {
+            for (int i = 0; i < aa; i++) {
+                ptr->input_pins[i] = node->input_pins[i + index * sizea - 1];
+                ptr->input_pins[i]->node = ptr;
+                ptr->input_pins[i]->pin_node_idx = i;
+            }
+            for (int i = 0; i < (sizea - aa); i++)
+                ptr->input_pins[i + aa] = NULL;
+        }
+    } else {
+        if (index == 0 && !configuration.adder_cin_global) {
+            if (flag == 0) {
+                ptr->input_pins[0] = NULL;
+                if (current_sizea > 1) {
+                    for (int i = 1; i < current_sizea; i++) {
+                        ptr->input_pins[i] = node->input_pins[i - 1];
+                        ptr->input_pins[i]->node = ptr;
+                        ptr->input_pins[i]->pin_node_idx = i;
+                    }
+                }
+            } else {
+                for (int i = 0; i < current_sizea; i++) {
+                    ptr->input_pins[i] = node->input_pins[i];
+                    ptr->input_pins[i]->node = ptr;
+                    ptr->input_pins[i]->pin_node_idx = i;
+                }
+            }
+        } else {
+            if (flag == 0) {
+                for (int i = 0; i < current_sizea; i++) {
+                    // use the offset to compensate for the dummy adder added at start of the chain
+                    ptr->input_pins[i] = node->input_pins[i + index * sizea - offset];
+                    ptr->input_pins[i]->node = ptr;
+                    ptr->input_pins[i]->pin_node_idx = i;
+                }
+            } else {
+                if (sizea == 0)
+                    connect_nodes(netlist->gnd_node, 0, ptr, 0);
+                else {
+                    num = node->input_port_sizes[0];
+                    for (int i = 0; i < current_sizea; i++) {
+                        ptr->input_pins[i] = node->input_pins[i + num - current_sizea];
+                        ptr->input_pins[i]->node = ptr;
+                        ptr->input_pins[i]->pin_node_idx = i;
+                    }
+                }
+            }
+        }
+    }
+    /* Port b: same cases as port a, but its pins are stored after the current_sizea pins of port a */
+    if (flagb == 1) {
+        for (int i = 0; i < current_sizeb; i++)
+            ptr->input_pins[i + current_sizea] = NULL; // b pins sit after the a pins, so offset by the a width
+    } else if (flagb == 2) {
+        if (index == 0) {
+            ptr->input_pins[sizea] = NULL;
+            if (current_sizeb > 1) {
+                for (int i = 1; i < bb; i++) {
+                    ptr->input_pins[i + current_sizea] = node->input_pins[i + a + index * sizeb - 1];
+                    ptr->input_pins[i + current_sizea]->node = ptr;
+                    ptr->input_pins[i + current_sizea]->pin_node_idx = i + current_sizea;
+                }
+                for (int i = 0; i < (sizeb - bb); i++)
+                    ptr->input_pins[i + current_sizea + bb] = NULL;
+            }
+        } else {
+            for (int i = 0; i < bb; i++) {
+                ptr->input_pins[i + current_sizea] = node->input_pins[i + a + index * sizeb - 1];
+                ptr->input_pins[i + current_sizea]->node = ptr;
+                ptr->input_pins[i + current_sizea]->pin_node_idx = i + current_sizea;
+            }
+            for (int i = 0; i < (sizeb - bb); i++)
+                ptr->input_pins[i + current_sizea + bb] = NULL;
+        }
+    } else {
+        if (index == 0 && !configuration.adder_cin_global) {
+            if (flag == 0) {
+                ptr->input_pins[sizea] = NULL;
+                if (current_sizeb > 1) {
+                    for (int i = 1; i < current_sizeb; i++) {
+                        ptr->input_pins[i + current_sizea] = node->input_pins[i + a + index * sizeb - 1];
+                        ptr->input_pins[i + current_sizea]->node = ptr;
+                        ptr->input_pins[i + current_sizea]->pin_node_idx = i + current_sizea;
+                    }
+                }
+            } else {
+                for (int i = 0; i < current_sizeb; i++) {
+                    ptr->input_pins[i + current_sizea] = node->input_pins[i + a];
+                    ptr->input_pins[i + current_sizea]->node = ptr;
+                    ptr->input_pins[i + current_sizea]->pin_node_idx = i + current_sizea;
+                }
+            }
+        } else {
+            if (flag == 0) {
+                for (int i = 0; i < current_sizeb; i++) {
+                    ptr->input_pins[i + current_sizea] = node->input_pins[i + a + index * sizeb - offset];
+                    ptr->input_pins[i + current_sizea]->node = ptr;
+                    ptr->input_pins[i + current_sizea]->pin_node_idx = i + current_sizea;
+                }
+            } else {
+                if (sizeb == 0)
+                    connect_nodes(netlist->gnd_node, 0, ptr, current_sizea);
+                else {
+                    num = node->input_port_sizes[0] + node->input_port_sizes[1];
+                    for (int i = 0; i < current_sizeb; i++) {
+                        ptr->input_pins[i + current_sizea] = node->input_pins[i + num - current_sizeb];
+                        ptr->input_pins[i + current_sizea]->node = ptr;
+                        ptr->input_pins[i + current_sizea]->pin_node_idx = i + current_sizea;
+                    }
+                }
+            }
+        }
+    }
+
+    /* Carry_in should be NULL*/
+    for (int i = 0; i < cin; i++) {
+        ptr->input_pins[i + current_sizea + current_sizeb] = NULL;
+    }
+
+    /* output pins */
+    int output;
+    if (current_sizea > current_sizeb)
+        output = current_sizea + cout;
+    else
+        output = current_sizeb + cout;
+
+    ptr->num_output_pins = output;
+    ptr->output_pins = (npin_t **)vtr::malloc(sizeof(void *) * output);
+    for (int i = 0; i < output; i++)
+        ptr->output_pins[i] = NULL;
+
+    return;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_adder)
+ *
+ * This function works to split a adder into several smaller
+ *  adders to better "fit" with the available resources in a
+ *  targeted FPGA architecture.
+ *
+ * This function is at the lowest level since it simply receives
+ *  a adder and is told how to split it.
+ *
+ * Note: In this function, we can do padding(default -1), fix the size of hard block adder.
+ *-----------------------------------------------------------------------*/
+
+void split_adder(nnode_t *nodeo, int a, int b, int sizea, int sizeb, int cin, int cout, int count, netlist_t *netlist)
+{
+    nnode_t **node;
+    int num, lefta = 0, leftb = 0;
+    int max_num = 0;
+    int flag = 0;
+
+    // if the input of the first cin is generated by a dummy adder added
+    // to the start of the chain, then an offset is needed to compensate
+    // for that in various positions in the code, otherwise the offset is 0
+    const int offset = (configuration.adder_cin_global) ? 0 : 1;
+
+    /* Check for a legitimate split */
+    oassert(nodeo->input_port_sizes[0] == a);
+    oassert(nodeo->input_port_sizes[1] == b);
+
+    node = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * (count));
+
+    for (int i = 0; i < count; i++) {
+        node[i] = allocate_nnode(nodeo->loc);
+        node[i]->name = (char *)vtr::malloc(strlen(nodeo->name) + 20);
+        odin_sprintf(node[i]->name, "%s-%d", nodeo->name, i);
+        if (i == count - 1) {
+            // fixed_hard_adder = 1 then adder need to be exact size;
+            if (configuration.fixed_hard_adder == 1)
+                init_split_adder(nodeo, node[i], a, sizea, b, sizeb, cin, cout, i, flag, netlist);
+            else {
+                if (count == 1) {
+                    lefta = a;
+                    leftb = b;
+                } else {
+                    lefta = (a + 1) % sizea;
+                    leftb = (b + 1) % sizeb;
+                }
+
+                max_num = (lefta >= leftb) ? lefta : leftb;
+                // if fixed_hard_adder = 0, and the left of a and b is more than min_add, then adder need to be remain the same size.
+                if (max_num >= min_add)
+                    init_split_adder(nodeo, node[i], a, sizea, b, sizeb, cin, cout, i, flag, netlist);
+                else {
+                    // Using soft logic to do the addition, No need to pad as the same size
+                    flag = 1;
+                    init_split_adder(nodeo, node[i], a, lefta, b, leftb, cin, cout, i, flag, netlist);
+                }
+            }
+        } else
+            init_split_adder(nodeo, node[i], a, sizea, b, sizeb, cin, cout, i, flag, netlist);
+
+        // store the processed hard adder node for optimization
+        processed_adder_list = insert_in_vptr_list(processed_adder_list, node[i]);
+    }
+
+    chain_information_t *adder_chain = allocate_chain_info();
+    // if flag = 0, the last adder use soft logic, so the count of the chain should be one less
+    if (flag == 0)
+        adder_chain->count = count;
+    else
+        adder_chain->count = count - 1;
+    adder_chain->num_bits = a + b;
+    adder_chain->name = nodeo->name;
+    chain_list = insert_in_vptr_list(chain_list, adder_chain);
+
+    // don't add a dummy adder in the beginning of the chain if the first cin will be connected to a global gnd
+    if ((flag == 0 || count > 1) && !configuration.adder_cin_global) {
+        // connect the a[0] and b[0] of first adder node to ground
+        connect_nodes(netlist->vcc_node, 0, node[0], 0);
+        connect_nodes(netlist->gnd_node, 0, node[0], sizea);
+        // hang the first sumout
+        node[0]->output_pins[1] = allocate_npin();
+        node[0]->output_pins[1]->name = append_string("", "%s~dummy_output~%d~%d", node[0]->name, 0, 1);
+    }
+
+    if (nodeo->num_input_port_sizes == 2) {
+        // connect the first cin pin to unconn
+        connect_nodes(netlist->pad_node, 0, node[0], node[0]->num_input_pins - 1);
+    } else if (nodeo->num_input_port_sizes == 3) {
+        // remap the first cin pins)
+        remap_pin_to_new_node(nodeo->input_pins[nodeo->num_input_pins - 1], node[0], (node[0]->num_input_pins - 1));
+    }
+    // if (a + 1) % sizea == 0 and (b + 1) % sizeb == 0, the a[0] and b[0] of node[count-1] should connect to ground
+    if ((a + 1) % sizea == 0 && (b + 1) % sizeb == 0) {
+        if (flag == 0) {
+            connect_nodes(netlist->gnd_node, 0, node[count - 1], 0);
+            connect_nodes(netlist->gnd_node, 0, node[count - 1], sizea);
+        }
+    }
+
+    // if any input pins beside first cin pins are NULL, connect those pins to unconn
+    for (int i = 0; i < count; i++) {
+        num = node[i]->num_input_pins;
+        for (int j = 0; j < num - 1; j++) {
+            if (node[i]->input_pins[j] == NULL)
+                connect_nodes(netlist->pad_node, 0, node[i], j);
+        }
+    }
+
+    if (configuration.adder_cin_global) {
+        // connect first cin to gnd
+        connect_nodes(netlist->gnd_node, 0, node[0], (node[0]->num_input_pins - 1));
+    }
+
+    // connect cout to next cin
+    for (int i = 1; i < count; i++)
+        connect_nodes(node[i - 1], 0, node[i], (node[i]->num_input_pins - 1));
+
+    // remap the output pins of each adder to nodeo
+    if (count == 1) {
+        if (flag == 0) {
+            for (int j = 0; j < node[0]->num_output_pins - 2; j++) {
+                if (j < nodeo->num_output_pins)
+                    remap_pin_to_new_node(nodeo->output_pins[j], node[0], j + 2);
+                else {
+                    node[0]->output_pins[j + 2] = allocate_npin();
+                    node[0]->output_pins[j + 2]->name = append_string("", "%s~dummy_output~%d~%d", node[0]->name, 0, j + 2);
+                }
+                // hang the first cout
+                node[0]->output_pins[0] = allocate_npin();
+                node[0]->output_pins[0]->name = append_string("", "%s~dummy_output~%d~%d", node[0]->name, 0, 0);
+            }
+        } else {
+            for (int j = 0; j < node[0]->num_output_pins - 1; j++)
+                remap_pin_to_new_node(nodeo->output_pins[j], node[0], j + 1);
+            remap_pin_to_new_node(nodeo->output_pins[nodeo->num_output_pins - 1], node[0], 0);
+        }
+    } else {
+        // First adder
+        for (int j = 0; j < node[0]->num_output_pins - 2; j++)
+            remap_pin_to_new_node(nodeo->output_pins[j], node[0], j + 2);
+        // if a dummy adder is added (offset = 1) start from the second adder)
+        for (int i = offset; i < count - 1; i++) {
+            for (int j = 0; j < node[i]->num_output_pins - 1; j++)
+                remap_pin_to_new_node(nodeo->output_pins[i * sizea + j - offset], node[i], j + 1);
+        }
+        // Last adder
+        if (flag == 0) {
+            for (int j = 0; j < node[count - 1]->num_output_pins - 1; j++) {
+                // if a dummy adder is added to this chain (offset = 1), adjust the index of the adder using the offset constant
+                if (((count - 1) * sizea + j - offset) < nodeo->num_output_pins)
+                    remap_pin_to_new_node(nodeo->output_pins[(count - 1) * sizea + j - offset], node[count - 1], j + 1);
+                else {
+                    node[count - 1]->output_pins[j + 1] = allocate_npin();
+                    // Pad outputs with a unique and descriptive name to avoid collisions.
+                    node[count - 1]->output_pins[j + 1]->name = append_string("", "%s~dummy_output~%d~%d", node[count - 1]->name, count - 1, j + 1);
+                }
+            }
+            // Hang the last cout
+            node[count - 1]->output_pins[0] = allocate_npin();
+            // Pad outputs with a unique and descriptive name to avoid collisions.
+            node[count - 1]->output_pins[0]->name = append_string("", "%s~dummy_output~%d~%d", node[count - 1]->name, count - 1, 0);
+        } else {
+            for (int j = 0; j < node[count - 1]->num_output_pins - 1; j++)
+                // if(((count - 1) * sizea + j - 1) < nodeo->num_output_pins)
+                remap_pin_to_new_node(nodeo->output_pins[(count - 1) * sizea + j - 1], node[count - 1], j + 1);
+            if (nodeo->output_pins[nodeo->num_output_pins - 1] != NULL)
+                remap_pin_to_new_node(nodeo->output_pins[nodeo->num_output_pins - 1], node[count - 1], 0);
+            else {
+                node[count - 1]->output_pins[0] = allocate_npin();
+                // Pad outputs with a unique and descriptive name to avoid collisions.
+                node[count - 1]->output_pins[0]->name = append_string("", "%s~dummy_output~%d~%d", node[count - 1]->name, count - 1, 0);
+            }
+        }
+    }
+
+    for (int i = offset; configuration.coarsen && i < count - 1; i++) {
+        for (int j = 0; j < node[i]->num_output_pins - 1; j++) {
+            char *new_output_pin_name = (char *)vtr::malloc((strlen(node[i]->name) + 20) * sizeof(char)); /* 6 chars for pin idx */
+            odin_sprintf(new_output_pin_name, "%s[1]", node[i]->name);
+            node[i]->output_pins[1]->name = new_output_pin_name;
+        }
+    }
+
+    /* Freeing the old node! */
+    cleanup_add_old_node(nodeo, netlist);
+
+    vtr::free(node);
+    return;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: iterate_adders)
+ *
+ * Drains add_list. Every node taken from the list must be an ADD
+ *	(a HARD_IP node is first relabelled as ADD). When its widest
+ *	operand reaches both min_threshold_adder and min_add, the node is
+ *	handed to split_adder so it can be spread over hard adder blocks;
+ *	otherwise it is moved, unsplit, to processed_adder_list.
+ *	Nothing happens when no hard adder model is available.
+ *-----------------------------------------------------------------------*/
+void iterate_adders(netlist_t *netlist)
+{
+    /* Can only perform the optimization if hard adders exist! */
+    if (hard_adders == NULL)
+        return;
+
+    // one extra (dummy) adder is prepended to the chain to feed the first
+    // cin with gnd, unless the first cin is driven by a global input
+    const int offset = (configuration.adder_cin_global) ? 0 : 1;
+
+    // In hard block adder, the summand and addend are same size,
+    // so both operand widths are read from the same model port.
+    const int sizecin = hard_adders->inputs->size;
+    const int sizea = hard_adders->inputs->next->size;
+    const int sizeb = hard_adders->inputs->next->size;
+
+    oassert(sizecin == 1);
+
+    while (add_list != NULL) {
+        nnode_t *node = (nnode_t *)add_list->data_vptr;
+        add_list = delete_in_vptr_list(add_list);
+
+        oassert(node != NULL);
+        if (node->type == HARD_IP)
+            node->type = ADD;
+        oassert(node->type == ADD);
+
+        const int a = node->input_port_sizes[0];
+        const int b = node->input_port_sizes[1];
+        const int widest = (a >= b) ? a : b;
+        node->bit_width = widest;
+
+        // too narrow for a hard adder: keep the node as it is
+        if (widest < min_threshold_adder || widest < min_add) {
+            processed_adder_list = insert_in_vptr_list(processed_adder_list, node);
+            continue;
+        }
+
+        // number of hard adders each operand needs: (width + 1) leaves room
+        // to pass the last cout to a sumout, plus the optional dummy adder
+        const int counta = (a + 1) / sizea + offset;
+        const int countb = (b + 1) / sizeb + offset;
+        const int count = (counta >= countb) ? counta : countb;
+
+        total++;
+        split_adder(node, a, b, sizea, sizeb, 1, 1, count, netlist);
+    }
+}
+
+/*-------------------------------------------------------------------------
+ * (function: clean_adders)
+ *
+ * Empties add_list by repeatedly passing it to delete_in_vptr_list
+ *	until the list pointer becomes NULL.
+ *-----------------------------------------------------------------------*/
+void clean_adders()
+{
+    for (; add_list != NULL; add_list = delete_in_vptr_list(add_list)) {
+        /* nothing else to do per link */
+    }
+}
+
+/*-------------------------------------------------------------------------
+ * (function: reduce_operations)
+ *
+ * Picks the global list that belongs to the requested operation
+ *	(ADD, MULTIPLY or MINUS) and runs traverse_list over it to merge
+ *	redundant operations. Any other operation results in a traversal
+ *	of an empty list with NO_OP, i.e. nothing is done.
+ *	The netlist argument is unused.
+ *-----------------------------------------------------------------------*/
+void reduce_operations(netlist_t * /*netlist*/, operation_list op)
+{
+    t_linked_vptr *list_head = NULL;
+    operation_list selected = NO_OP;
+
+    if (op == ADD) {
+        list_head = add_list;
+        selected = ADD;
+    } else if (op == MULTIPLY) {
+        list_head = mult_list;
+        selected = MULTIPLY;
+    } else if (op == MINUS) {
+        list_head = sub_list;
+        selected = MINUS;
+    }
+
+    traverse_list(selected, list_head);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: traverse_list)
+ *
+ * Walks the operation list and calls match_node on every link that
+ *	still has a successor; the last link has nothing left to be
+ *	compared with and is therefore not visited.
+ *-----------------------------------------------------------------------*/
+void traverse_list(operation_list oper, t_linked_vptr *place)
+{
+    // match_node may unlink successors of `place`, so the condition is
+    // re-evaluated on the live list after every call
+    for (; place != NULL && place->next != NULL; place = place->next)
+        match_node(place, oper);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: match_node)
+ *
+ * Compares the node stored in `place` with the node of every later link.
+ *	A later node that has the same type and input pin count, and for which
+ *	both match_ports and match_pins return 1, is merged into the first node
+ *	and its link is removed from the list.
+ *-------------------------------------------------------------------------*/
+void match_node(t_linked_vptr *place, operation_list oper)
+{
+    nnode_t *anchor = (nnode_t *)place->data_vptr;
+    t_linked_vptr *prev = place;
+    t_linked_vptr *cursor = place->next;
+
+    while (cursor != NULL) {
+        nnode_t *candidate = (nnode_t *)cursor->data_vptr;
+
+        // checks are ordered from cheapest to most expensive; the short-circuit
+        // keeps match_pins from running unless match_ports succeeded
+        const bool redundant = anchor->type == candidate->type
+                               && anchor->num_input_pins == candidate->num_input_pins
+                               && match_ports(anchor, candidate, oper) == 1
+                               && match_pins(anchor, candidate) == 1;
+
+        if (redundant) {
+            merge_nodes(anchor, candidate);
+            remove_list_node(prev, cursor);
+            // `cursor` has been freed; resume from the link that replaced it
+            cursor = prev->next;
+        } else {
+            prev = cursor;
+            cursor = cursor->next;
+        }
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * (function: match_ports)
+ *
+ * Decides whether two operation nodes apply `oper` to the same operands by
+ *	comparing the operand names/values found in their related AST nodes.
+ *	For ADD and MULTIPLY the operands may appear in either order; for MINUS
+ *	the order has to be the same.
+ *
+ * Returns 1 when the operands match, 0 otherwise. 0 is also returned when
+ *	either node has no related AST node (e.g. coarsen blifs), when the AST
+ *	node of `node` is not an `oper` operation, or when an operand is neither
+ *	an identifier nor a number.
+ *-------------------------------------------------------------------------*/
+int match_ports(nnode_t *node, nnode_t *next_node, operation_list oper)
+{
+    /* number of operand slots handed to traverse_operation_node */
+    const int max_components = 2;
+
+    int flag = 0;
+    int sign = 0;
+    int mark1 = 1;
+    int mark2 = 1;
+    char *component_s[2] = {0};
+    char *component_o[2] = {0};
+    ast_node_t *ast_node = node->related_ast_node;
+    ast_node_t *ast_node_next = next_node->related_ast_node;
+
+    /* in case of coarsen blifs, there is no related ast node, so we skip this part.
+     * Both nodes are checked: the second AST node is dereferenced below as well. */
+    if (ast_node == NULL || ast_node_next == NULL || ast_node->types.operation.op != oper)
+        return flag;
+
+    traverse_operation_node(ast_node, component_s, oper, &sign);
+    if (sign != 1) {
+        traverse_operation_node(ast_node_next, component_o, oper, &sign);
+        if (sign != 1) {
+            oassert(component_s[0] && component_o[0] && "missing children on operation");
+            switch (oper) {
+            case ADD:
+            case MULTIPLY: {
+                mark1 = strcmp(component_s[0], component_o[0]);
+                if (component_s[1] && component_o[1]) {
+                    if (mark1 == 0) {
+                        mark2 = strcmp(component_s[1], component_o[1]);
+                    } else {
+                        /* commutative operation: retry with swapped operands */
+                        mark1 = strcmp(component_s[0], component_o[1]);
+                        mark2 = strcmp(component_s[1], component_o[0]);
+                    }
+                }
+            } break;
+
+            case MINUS: {
+                mark1 = strcmp(component_s[0], component_o[0]);
+                if (mark1 == 0 && component_s[1] && component_o[1]) {
+                    mark2 = strcmp(component_s[1], component_o[1]);
+                }
+            } break;
+
+            default:
+
+                break;
+            }
+            if (mark1 == 0 && mark2 == 0) {
+                flag = 1;
+            }
+        }
+    }
+
+    /* Identifier components alias strings owned by the AST; every other filled
+     * slot was allocated by traverse_operation_node and is released here.
+     * The index is bounded by the array size so extra children are never used
+     * to index past the component arrays. */
+    for (int i = 0; i < ast_node->num_children && i < max_components; i++) {
+        if (ast_node->children[i]->type != IDENTIFIERS) {
+            vtr::free(component_s[i]);
+        }
+    }
+    for (int i = 0; i < ast_node_next->num_children && i < max_components; i++) {
+        if (ast_node_next->children[i]->type != IDENTIFIERS) {
+            vtr::free(component_o[i]);
+        }
+    }
+
+    return flag;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: traverse_operation_node)
+ *
+ * Collects the operands of an AST operation node of kind `op` into
+ *	component[], one slot per child, in child order:
+ *	 - an identifier child stores a pointer to the AST's own string,
+ *	 - a number child stores a freshly allocated decimal string that
+ *	   the caller has to free.
+ *	*mark is set to 0 for every child that could be collected and to 1
+ *	(ending the scan) on the first child of any other kind. Nothing is
+ *	touched when node is NULL or is not an `op` operation.
+ *	component[] must provide one slot per child of the node.
+ *-----------------------------------------------------------------------*/
+void traverse_operation_node(ast_node_t *node, char *component[], operation_list op, int *mark)
+{
+    if (node == NULL || node->types.operation.op != op)
+        return;
+
+    for (long idx = 0; idx < node->num_children; idx++) {
+        ast_node_t *child = node->children[idx];
+
+        if (child->type == IDENTIFIERS) {
+            *mark = 0;
+            component[idx] = child->types.identifier;
+        } else if (child->type == NUMBERS) {
+            *mark = 0;
+            long value = child->types.vnumber->get_value();
+            // first pass only measures the printed length
+            long len = snprintf(NULL, 0, "%ld", value);
+            component[idx] = (char *)vtr::calloc(len + 1, sizeof(char));
+            odin_sprintf(component[idx], "%ld", value);
+        } else {
+            // operand is itself an expression: it cannot be compared by name
+            *mark = 1;
+            break;
+        }
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * (function: merge_nodes)
+ *
+ * Folds next_node into node. The three steps depend on this order:
+ *	next_node's input pins are first taken out of the fanout of the nets
+ *	driving them, then the consumers of next_node's outputs are attached to
+ *	the output nets of node, and finally next_node and its output nets are
+ *	freed.
+ *-------------------------------------------------------------------------*/
+void merge_nodes(nnode_t *node, nnode_t *next_node)
+{
+    remove_fanout_pins(next_node);
+    reallocate_pins(node, next_node);
+    free_op_nodes(next_node);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: remove_list_node)
+ *
+ * Unlinks `next` (the link that follows `pre`) from the list and frees the
+ *	link itself; `pre` is connected to whatever followed `next`, which may be
+ *	NULL.
+ *-------------------------------------------------------------------------*/
+void remove_list_node(t_linked_vptr *pre, t_linked_vptr *next)
+{
+    t_linked_vptr *successor = next->next;
+
+    pre->next = successor;
+    vtr::free(next);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: remove_fanout_pins)
+ *
+ * Detaches every input pin of `node` from the fanout array of the net that
+ *	drives it. The pin is located by its unique_id; the entries after it are
+ *	shifted down by one (their pin_net_idx is updated to the new position),
+ *	the vacated last slot is cleared and the fanout count is decremented.
+ *	A pin that is not present in its net's fanout array leaves the net
+ *	unchanged.
+ *-------------------------------------------------------------------------*/
+void remove_fanout_pins(nnode_t *node)
+{
+    for (int i = 0; i < node->num_input_pins; i++) {
+        npin_t *pin = node->input_pins[i];
+        nnet_t *net = pin->net;
+        /* kept as long, matching how match_pins handles unique_id */
+        long id = pin->unique_id;
+
+        /* locate the pin inside the fanout array; slots may hold NULL */
+        int pos;
+        for (pos = 0; pos < net->num_fanout_pins; pos++) {
+            if (net->fanout_pins[pos] != NULL && net->fanout_pins[pos]->unique_id == id)
+                break;
+        }
+
+        /* not found: do not touch the array, otherwise a slot past the live
+         * entries would be written and the count would become wrong */
+        if (pos == net->num_fanout_pins)
+            continue;
+
+        /* close the gap and keep the back-references of the moved pins valid */
+        for (; pos < net->num_fanout_pins - 1; pos++) {
+            net->fanout_pins[pos] = net->fanout_pins[pos + 1];
+            if (net->fanout_pins[pos] != NULL)
+                net->fanout_pins[pos]->pin_net_idx = pos;
+        }
+        net->fanout_pins[pos] = NULL;
+        net->num_fanout_pins--;
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * (function: reallocate_pins)
+ *
+ * For every output pin i of next_node, walks the fanout of its net. A fanout
+ *	pin that belongs to a node is looked up again through that node's input
+ *	pin table and added to the fanout of the net of node's output pin i;
+ *	a fanout pin without a node is freed.
+ *-------------------------------------------------------------------------*/
+void reallocate_pins(nnode_t *node, nnode_t *next_node)
+{
+    for (int out = 0; out < next_node->num_output_pins; out++) {
+        for (int f = 0; f < next_node->output_pins[out]->net->num_fanout_pins; f++) {
+            npin_t *sink = next_node->output_pins[out]->net->fanout_pins[f];
+
+            if (sink->node == NULL) {
+                // dangling fanout entry, nothing to reconnect
+                free_npin(sink);
+                continue;
+            }
+
+            nnode_t *consumer = sink->node;
+            nnet_t *target_net = node->output_pins[out]->net;
+            int consumer_pin_idx = sink->pin_node_idx;
+            npin_t *consumer_pin = consumer->input_pins[consumer_pin_idx];
+            add_fanout_pin_to_net(target_net, consumer_pin);
+        }
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * (function: free_op_nodes)
+ *
+ * Frees the net attached to each output pin of `node` (when there is one)
+ *	and then frees the node itself.
+ *-------------------------------------------------------------------------*/
+void free_op_nodes(nnode_t *node)
+{
+    for (int out = 0; out < node->num_output_pins; out++) {
+        nnet_t *driven_net = node->output_pins[out]->net;
+        if (driven_net == NULL)
+            continue;
+        free_nnet(driven_net);
+    }
+    free_nnode(node);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: match_pins)
+ *
+ * Checks that every driver pin feeding an input of `node` also drives some
+ *	input of `next_node` (drivers are compared by unique_id, the input
+ *	position is not taken into account).
+ *
+ * Returns 1 when all drivers are found, -1 as soon as one is missing.
+ *-------------------------------------------------------------------------*/
+int match_pins(nnode_t *node, nnode_t *next_node)
+{
+    for (int in = 0; in < node->num_input_pins; in++) {
+        for (int drv = 0; drv < node->input_pins[in]->net->num_driver_pins; drv++) {
+            long wanted = node->input_pins[in]->net->driver_pins[drv]->unique_id;
+
+            // scan the drivers of all inputs of next_node until the id shows up
+            bool present = false;
+            for (int other_in = 0; other_in < next_node->num_input_pins && !present; other_in++) {
+                for (int other_drv = 0; other_drv < next_node->input_pins[other_in]->net->num_driver_pins && !present; other_drv++) {
+                    if (wanted == next_node->input_pins[other_in]->net->driver_pins[other_drv]->unique_id)
+                        present = true;
+                }
+            }
+
+            if (!present)
+                return -1;
+        }
+    }
+
+    return 1;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * Hand one output pin of `node` over to output `output_pin_id` of `current_adder`.
+ *
+ * For a subtraction the pin at `current_pin` is moved as is. Otherwise the source pin is
+ * `current_pin` when node has two input ports; with any other port count it is
+ * `current_pin + 1`, wrapping to pin 0 for the last position of width[output_pin_id].
+ * A missing (NULL) source pin is ignored. A source pin of type NO_ID on a node that does not
+ * have two input ports is not moved; the adder gets a fresh dummy output pin instead.
+ *-------------------------------------------------------------------------------------------*/
+static void connect_output_pin_to_node(int *width, int current_pin, int output_pin_id, nnode_t *node, nnode_t *current_adder, short subtraction)
+{
+    if (subtraction) {
+        remap_pin_to_new_node(node->output_pins[current_pin], current_adder, output_pin_id);
+        return;
+    }
+
+    const bool has_two_ports = (node->num_input_port_sizes == 2);
+
+    // pick the output pin of the original node that this adder bit drives
+    int source_idx;
+    if (has_two_ports)
+        source_idx = current_pin;
+    else if (current_pin < width[output_pin_id] - 1)
+        source_idx = current_pin + 1;
+    else
+        source_idx = 0;
+
+    npin_t *source_pin = node->output_pins[source_idx];
+    if (source_pin == NULL)
+        return;
+
+    if (source_pin->type != NO_ID || has_two_ports) {
+        remap_pin_to_new_node(source_pin, current_adder, output_pin_id);
+    } else {
+        current_adder->output_pins[output_pin_id] = allocate_npin();
+        current_adder->output_pins[output_pin_id]->name = append_string("", "%s~dummy_output~%d", current_adder->name, output_pin_id);
+    }
+}
+
+/*---------------------------------------------------------------------------------------------
+ * make a single half-adder (can do unary subtraction, binary subtraction and addition)
+ *
+ * Builds one gate for bit `current_pin`: the sum (funct == ADDER_FUNC) or the carry
+ * (funct == CARRY_FUNC).
+ *
+ *  - previous_carry: node driving the carry-in. When it is the netlist gnd or vcc node the
+ *    constant is folded into a 2-input gate; otherwise a 3-input gate is created and the
+ *    carry is connected to its input 0.
+ *  - current_adder: when not NULL, the operand pins are copied from this already built gate
+ *    of the same bit; when NULL they are taken from `node`.
+ *  - width: width[1] / width[2] bound the bits available for operand a / operand b;
+ *    operand b pins follow operand a pins in node->input_pins.
+ *  - subtraction: when set, operand b is inverted (constants swapped, a NOT gate otherwise).
+ *
+ * Returns the newly created gate.
+ *-------------------------------------------------------------------------------------------*/
+static nnode_t *make_adder(operation_list funct, nnode_t *current_adder, nnode_t *previous_carry, int *width, int current_pin, netlist_t *netlist,
+                           nnode_t *node, short subtraction, short mark)
+{
+    // make a 2 bit or 3 bit sum or carry based on previous carry
+    nnode_t *new_funct = NULL;
+    // doubles as the index shift of the operand pins: 1 when input 0 holds the carry
+    short is_three_port_gate = 0;
+
+    // carry-in is constant 0: sum = a XOR b, carry = a AND b
+    if (previous_carry == netlist->gnd_node) {
+        if (funct == ADDER_FUNC)
+            new_funct = make_2port_gate(LOGICAL_XOR, 1, 1, 1, node, mark);
+        else if (funct == CARRY_FUNC)
+            new_funct = make_2port_gate(LOGICAL_AND, 1, 1, 1, node, mark);
+    // carry-in is constant 1: sum = a XNOR b, carry = a OR b
+    } else if (previous_carry == netlist->vcc_node) {
+        if (funct == ADDER_FUNC)
+            new_funct = make_2port_gate(LOGICAL_XNOR, 1, 1, 1, node, mark);
+        else if (funct == CARRY_FUNC)
+            new_funct = make_2port_gate(LOGICAL_OR, 1, 1, 1, node, mark);
+    // NOTE(review): with a constant carry and a funct other than ADDER_FUNC/CARRY_FUNC,
+    // new_funct stays NULL and is used below - callers visible here only pass those two.
+    } else {
+        new_funct = make_3port_gate(funct, 1, 1, 1, 1, node, mark);
+        connect_nodes(previous_carry, 0, new_funct, 0);
+        is_three_port_gate = 1;
+    }
+
+    // copy the input pin of a half-adder to another function (CARRY or ADDER)
+    if (current_adder != NULL) {
+        add_input_pin_to_node(new_funct, copy_input_npin(current_adder->input_pins[0 + is_three_port_gate]), 0 + is_three_port_gate);
+        add_input_pin_to_node(new_funct, copy_input_npin(current_adder->input_pins[1 + is_three_port_gate]), 1 + is_three_port_gate);
+    }
+    // create one from scratch
+    else {
+        // connect input a
+        if (current_pin < width[1]) {
+            npin_t *temp_pin = node->input_pins[current_pin];
+            oassert(temp_pin->net->num_driver_pins <= 1);
+            // undriven or gnd-driven bit: tie the gate input to gnd and drop the old pin from its net
+            if (!temp_pin->net->num_driver_pins || temp_pin->net->driver_pins[0]->node->type == GND_NODE) {
+                connect_nodes(netlist->gnd_node, 0, new_funct, 0 + is_three_port_gate);
+                remove_fanout_pins_from_net(temp_pin->net, temp_pin, temp_pin->pin_net_idx);
+            // vcc-driven bit: tie the gate input to vcc and drop the old pin from its net
+            } else if (temp_pin->net->driver_pins[0]->node->type == VCC_NODE) {
+                connect_nodes(netlist->vcc_node, 0, new_funct, 0 + is_three_port_gate);
+                remove_fanout_pins_from_net(temp_pin->net, temp_pin, temp_pin->pin_net_idx);
+            // regular signal: move the pin itself onto the new gate
+            } else {
+                remap_pin_to_new_node(temp_pin, new_funct, 0 + is_three_port_gate);
+            }
+        // operand a has no bit at this position: pad with gnd
+        } else {
+            connect_nodes(netlist->gnd_node, 0, new_funct, 0 + is_three_port_gate);
+        }
+
+        // connect input b
+        if (current_pin < width[2]) {
+            // pin a is neighbor to pin b
+            npin_t *temp_pin = node->input_pins[current_pin + width[1]];
+            oassert(temp_pin->net->num_driver_pins <= 1);
+            // constant 0 on b: becomes vcc when subtracting (b is inverted), gnd otherwise
+            if (temp_pin->net->num_driver_pins == 0 || temp_pin->net->driver_pins[0]->node->type == GND_NODE) {
+                nnode_t *attach_to = (subtraction) ? netlist->vcc_node : netlist->gnd_node;
+                connect_nodes(attach_to, 0, new_funct, 1 + is_three_port_gate);
+                remove_fanout_pins_from_net(temp_pin->net, temp_pin, temp_pin->pin_net_idx);
+            // constant 1 on b: becomes gnd when subtracting, vcc otherwise
+            } else if (temp_pin->net->driver_pins[0]->node->type == VCC_NODE) {
+                nnode_t *attach_to = (subtraction) ? netlist->gnd_node : netlist->vcc_node;
+                connect_nodes(attach_to, 0, new_funct, 1 + is_three_port_gate);
+                remove_fanout_pins_from_net(temp_pin->net, temp_pin, temp_pin->pin_net_idx);
+            } else {
+                // regular signal: route it through a NOT gate when subtracting
+                if (subtraction) {
+                    nnode_t *new_not_cells = make_not_gate(node, mark);
+                    remap_pin_to_new_node(temp_pin, new_not_cells, 0);
+                    connect_nodes(new_not_cells, 0, new_funct, 1 + is_three_port_gate);
+                } else {
+                    remap_pin_to_new_node(temp_pin, new_funct, 1 + is_three_port_gate);
+                }
+            }
+        // operand b has no bit at this position: pad with the (inverted, if subtracting) zero
+        } else {
+            nnode_t *attach_to = (subtraction) ? netlist->vcc_node : netlist->gnd_node;
+            connect_nodes(attach_to, 0, new_funct, 1 + is_three_port_gate);
+        }
+    }
+    return new_funct;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * Build a ripple carry adder (or subtractor) out of single-bit gates for `node`.
+ *
+ * width[0] bits are generated. The chain starts with a constant carry: vcc for a subtraction,
+ * gnd otherwise. Each bit gets a sum gate whose output takes over the matching output pin of
+ * `node`, and a carry gate feeding the next bit; the carry gate of the very last bit is left
+ * out when subtracting.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_add_w_carry_block(int *width, nnode_t *node, short mark, netlist_t *netlist, short subtraction)
+{
+    nnode_t *carry = netlist->gnd_node;
+    if (subtraction)
+        carry = netlist->vcc_node;
+
+    const int last_bit = width[0] - 1;
+
+    for (int bit = 0; bit < width[0]; bit++) {
+        // build Ripple Carry Adder
+        nnode_t *sum_gate = make_adder(ADDER_FUNC, NULL, carry, width, bit, netlist, node, subtraction, mark);
+
+        // the final carry of a subtraction is not constructed
+        const bool skip_carry = (bit == last_bit) && subtraction;
+        if (!skip_carry)
+            carry = make_adder(CARRY_FUNC, sum_gate, carry, width, bit, netlist, node, subtraction, mark);
+
+        connect_output_pin_to_node(width, bit, 0, node, sum_gate, subtraction);
+    }
+}
+
+/**
+ * -------------------------------------------------------------------------
+ * (function: cleanup_add_old_node)
+ *
+ * @brief <clean up nodeo, a high level ADD node>
+ * In the split_adder function, nodeo is split into smaller adders. Because of
+ * the complexity of the input pin connections, the input pins have not been
+ * remapped to the new nodes; they were copied and added to the new nodes. This
+ * function detaches the input pins from nodeo. Moreover, it connects the net of
+ * output signals that are still attached to nodeo to the GND node, detaches
+ * those pins from nodeo and frees them to avoid a memory leak. Finally nodeo
+ * itself is freed, so it must not be used after this call.
+ *
+ * @param nodeo representing the old adder node
+ * @param netlist representing the current netlist
+ *-----------------------------------------------------------------------*/
+static void cleanup_add_old_node(nnode_t *nodeo, netlist_t *netlist)
+{
+    /* Disconnecting input pins from the old node side */
+    /* (only the pointers are cleared; the pin objects are not freed here) */
+    for (int i = 0; i < nodeo->num_input_pins; i++) {
+        nodeo->input_pins[i] = NULL;
+    }
+
+    /* connecting the extra output pins to the gnd node */
+    for (int i = 0; i < nodeo->num_output_pins; i++) {
+        npin_t *output_pin = nodeo->output_pins[i];
+
+        /* only pins that still point at a node are handled here */
+        if (output_pin && output_pin->node) {
+            /* for now we just pass the signals directly through */
+            npin_t *zero_pin = get_zero_pin(netlist);
+            /* remember the slot of zero_pin in its net before the nets are joined */
+            int idx_2_buffer = zero_pin->pin_net_idx;
+
+            // Don't eliminate the buffer if there are multiple drivers or the AST included it
+            if (output_pin->net->num_driver_pins <= 1) {
+                /* join all fanouts of the output net with the input pins net */
+                join_nets(zero_pin->net, output_pin->net);
+
+                /* erase the pointer to this buffer */
+                zero_pin->net->fanout_pins[idx_2_buffer] = NULL;
+            }
+
+            /* NOTE(review): when the branch above is skipped (multiple drivers), zero_pin is
+             * freed without its slot in zero_pin->net being cleared - presumably
+             * get_zero_pin registers the pin in that net; confirm this cannot dangle. */
+            free_npin(zero_pin);
+            free_npin(output_pin);
+
+            /* Disconnecting output pins from the old node side */
+            nodeo->output_pins[i] = NULL;
+        }
+    }
+
+    // CLEAN UP
+    free_nnode(nodeo);
+}
+
+/**
+ *-------------------------------------------------------------------------------------------
+ * (function: check_missing_ports )
+ *
+ * @brief check for missing ports such as carry-in/out in case of
+ * dealing with generated netlist from Yosys blif file.
+ *
+ * A node with two input ports is replaced by a new three-port node of the same
+ * type: both operands are moved over, a carry-in tied to GND is appended as the
+ * last input, and the output is sized to max(operand widths) + 1. Missing output
+ * pins are created on fresh nets; surplus output pins of the old node are driven
+ * from GND through buffer nodes. The old node is freed.
+ * A node with a single input port is returned unchanged.
+ *
+ * @param node pointing to the netlist node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ *
+ * @return the replacement node (two input ports), the node itself (one input
+ * port), or NULL for any other number of input ports
+ *-----------------------------------------------------------------------------------------*/
+nnode_t *check_missing_ports(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    nnode_t *new_node = NULL;
+    int num_input_port = node->num_input_port_sizes;
+
+    /* check for operations that has 2 operands */
+    if (num_input_port == 2) {
+        int in_port1_size = node->input_port_sizes[0];
+        int in_port2_size = node->input_port_sizes[1];
+        /* one bit wider than the widest operand */
+        int out_port_size = (in_port1_size >= in_port2_size) ? in_port1_size + 1 : in_port2_size + 1;
+
+        /* third input port has width 1: it holds the carry-in added below */
+        new_node = make_3port_gate(node->type, in_port1_size, in_port2_size, 1, out_port_size, node, traverse_mark_number);
+
+        /* copy attributes */
+        copy_attribute(new_node->attributes, node->attributes);
+
+        /* first operand keeps its pin positions */
+        for (int i = 0; i < in_port1_size; i++) {
+            remap_pin_to_new_node(node->input_pins[i], new_node, i);
+        }
+
+        /* second operand follows directly after the first one */
+        for (int i = 0; i < in_port2_size; i++) {
+            remap_pin_to_new_node(node->input_pins[i + in_port1_size], new_node, i + in_port1_size);
+        }
+
+        /* adding a cin connected to GND */
+        npin_t *cin_pin = get_zero_pin(netlist);
+        cin_pin->type = INPUT;
+        cin_pin->mapping = vtr::strdup("cin");
+
+        /* the carry-in occupies the last input pin of the new node */
+        add_input_pin_to_node(new_node, cin_pin, new_node->num_input_pins - 1);
+
+        // moving the output pins to the new node
+        for (int i = 0; i < out_port_size; i++) {
+            if (i < node->num_output_pins) {
+                remap_pin_to_new_node(node->output_pins[i], new_node, i);
+            } else {
+                /* the old node has no pin for this bit: create one, driving a new net */
+                npin_t *new_pin1 = allocate_npin();
+                npin_t *new_pin2 = allocate_npin();
+                nnet_t *new_net = allocate_nnet();
+                new_net->name = make_full_ref_name(NULL, NULL, NULL, new_node->name, i);
+                /* hook the output pin into the node */
+                add_output_pin_to_node(new_node, new_pin1, i);
+                /* hook up new pin 1 into the new net */
+                add_driver_pin_to_net(new_net, new_pin1);
+                /* hook up the new pin 2 to this new net */
+                add_fanout_pin_to_net(new_net, new_pin2);
+            }
+        }
+
+        /**
+         * if number of output pins is greater than the max of input pins,
+         * here we connect the exceeded pins to the GND
+         */
+        for (int i = out_port_size; i < node->num_output_pins; i++) {
+            /* creating a buf node */
+            nnode_t *buf_node = make_1port_gate(BUF_NODE, 1, 1, node, traverse_mark_number);
+            /* adding the GND input pin to the buf node */
+            add_input_pin_to_node(buf_node, get_zero_pin(netlist), 0);
+            /* remapping the outpin to buf node */
+            remap_pin_to_new_node(node->output_pins[i], buf_node, 0);
+        }
+
+        // CLEAN UP
+        free_nnode(node);
+    }
+    /* otherwise there is unary minus, like -A. no need for any change */
+    else if (num_input_port == 1) {
+        new_node = node;
+    }
+
+    /* NOTE(review): a node with any other port count (e.g. one that already has a
+     * carry-in) yields NULL here - confirm callers never pass such a node. */
+    return new_node;
+}
\ No newline at end of file
diff --git a/parmys-plugin/core/adder.h b/parmys-plugin/core/adder.h
new file mode 100644
index 000000000..3b2c7222d
--- /dev/null
+++ b/parmys-plugin/core/adder.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _ADDER_H_
+#define _ADDER_H_
+
+#include "odin_types.h"
+#include "read_xml_arch_file.h"
+#include <vector>
+
+struct t_adder { // NOTE(review): looks like one record per adder port-size configuration — confirm
+    int size_a;      // presumably bit width of operand "a" — TODO confirm against users
+    int size_b;      // presumably bit width of operand "b" — TODO confirm
+    int size_cin;    // presumably bit width of the carry-in port — TODO confirm
+    int size_sumout; // presumably bit width of the sum output port — TODO confirm
+    int size_cout;   // presumably bit width of the carry-out port — TODO confirm
+    struct t_adder *next; // pointer to another t_adder, so records can be chained into a list
+};
+
+extern t_model *hard_adders;
+extern vtr::t_linked_vptr *add_list;
+extern vtr::t_linked_vptr *chain_list;
+extern vtr::t_linked_vptr *processed_adder_list;
+extern int total;
+extern int min_add;
+extern int min_threshold_adder;
+
+void init_add_distribution();
+void report_add_distribution();
+void declare_hard_adder(nnode_t *node);
+void instantiate_hard_adder(nnode_t *node, short mark, netlist_t *netlist);
+void find_hard_adders();
+void add_the_blackbox_for_adds_yosys(Yosys::Design *design);
+void define_add_function_yosys(nnode_t *node, Yosys::Module *module, Yosys::Design *design);
+void split_adder(nnode_t *node, int a, int b, int sizea, int sizeb, int cin, int cout, int count, netlist_t *netlist);
+void iterate_adders(netlist_t *netlist);
+void clean_adders();
+void reduce_operations(netlist_t *netlist, operation_list op);
+void traverse_list(operation_list oper, vtr::t_linked_vptr *place);
+void match_node(vtr::t_linked_vptr *place, operation_list oper);
+int match_ports(nnode_t *node, nnode_t *next_node, operation_list oper);
+void traverse_operation_node(ast_node_t *node, char *component[], operation_list op, int *mark);
+void merge_nodes(nnode_t *node, nnode_t *next_node);
+void remove_list_node(vtr::t_linked_vptr *node, vtr::t_linked_vptr *place);
+void remove_fanout_pins(nnode_t *node);
+void reallocate_pins(nnode_t *node, nnode_t *next_node);
+void free_op_nodes(nnode_t *node);
+int match_pins(nnode_t *node, nnode_t *next_node);
+
+void instantiate_add_w_carry_block(int *width, nnode_t *node, short mark, netlist_t *netlist, short subtraction);
+nnode_t *check_missing_ports(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+#endif // _ADDER_H_
diff --git a/parmys-plugin/core/block_memory.cc b/parmys-plugin/core/block_memory.cc
new file mode 100644
index 000000000..1b600771a
--- /dev/null
+++ b/parmys-plugin/core/block_memory.cc
@@ -0,0 +1,2231 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_util.h"
+#include <string.h>
+
+#include "block_memory.h"
+#include "hard_block.h"
+#include "memory.h"
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "partial_map.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+using vtr::t_linked_vptr;
+/* global index of block memory instances: one hashtable for block memories, one for read-only memories */
+struct block_memory_information_t block_memories_info;
+
+static block_memory_t *init_block_memory(nnode_t *node, netlist_t *netlist);
+
+void map_bram_to_mem_hardblocks(block_memory_t *bram, netlist_t *netlist);
+void map_rom_to_mem_hardblocks(block_memory_t *rom, netlist_t *netlist);
+
+static void create_r_single_port_ram(block_memory_t *rom, netlist_t *netlist);
+static void create_2r_dual_port_ram(block_memory_t *rom, netlist_t *netlist);
+static void create_nr_single_port_ram(block_memory_t *rom, netlist_t *netlist);
+static void create_rw_single_port_ram(block_memory_t *bram, netlist_t * /* netlist */);
+static void create_rw_dual_port_ram(block_memory_t *bram, netlist_t *netlist);
+static void create_r2w_dual_port_ram(block_memory_t *bram, netlist_t *netlist);
+static void create_2rw_dual_port_ram(block_memory_t *bram, netlist_t *netlist);
+static void create_2r2w_dual_port_ram(block_memory_t *bram, netlist_t *netlist);
+static void create_nrmw_dual_port_ram(block_memory_t *bram, netlist_t *netlist);
+static void create_2rw_multiplexed_dual_port_ram(block_memory_t *bram, netlist_t *netlist);
+
+static nnode_t *ymem_to_rom(nnode_t *node, uintptr_t traverse_mark_number);
+static nnode_t *ymem2_to_rom(nnode_t *node, uintptr_t traverse_mark_number);
+static nnode_t *ymem_to_bram(nnode_t *node, uintptr_t traverse_mark_number);
+static nnode_t *ymem2_to_bram(nnode_t *node, uintptr_t traverse_mark_number);
+
+static bool check_same_addrs(block_memory_t *bram);
+static void perform_optimization(block_memory_t *memory);
+static signal_list_t *split_cascade_port(signal_list_t *signalvar, signal_list_t *selectors, int desired_width, nnode_t *node, netlist_t *netlist);
+static void decode_out_port(signal_list_t *src, signal_list_t *outs, signal_list_t *selectors, nnode_t *node, netlist_t *netlist);
+
+static void cleanup_block_memory_old_node(nnode_t *old_node);
+
+static void free_block_memory_index(block_memory_hashtable to_free);
+static void free_block_memory(block_memory_t *to_free);
+
+/**
+ * (function: init_block_memory_index)
+ * @brief Resets the two global memory lookup tables to empty hashtables.
+ */
+void init_block_memory_index()
+{
+    struct block_memory_information_t &index = block_memories_info;
+    index.read_only_memories = block_memory_hashtable();
+    index.block_memories = block_memory_hashtable();
+}
+
+/**
+ * (function: init_block_memory)
+ *
+ * @brief: allocates a block_memory_t for a bram node, copies the node pins into per-port signal lists and registers it by name
+ * @return the new block memory; it is also inserted into block_memories_info.block_memories
+ * @param node pointing to a bram node
+ * @param netlist pointer to the current netlist file (unnamed and unused in this function)
+ */
+static block_memory_t *init_block_memory(nnode_t *node, netlist_t * /* netlist */)
+{
+    block_memory_t *bram = (block_memory_t *)vtr::malloc(sizeof(block_memory_t));
+
+    /**
+     * BRAM port layout read from the node
+     * (input pins are indexed back to back, following the offsets computed below)
+     * CLK:        input port [0]
+     * RD_ADDR:    input port [1]
+     * RD_DATA:    output port[0]
+     * RD_ENABLE:  input port [2]
+     * WR_ADDR:    input port [3]
+     * WR_DATA:    input port [4]
+     * WR_ENABLE:  input port [5]
+     */
+
+    int CLK_width = node->input_port_sizes[0];
+    int RD_ADDR_width = node->input_port_sizes[1];
+    int RD_DATA_width = node->output_port_sizes[0];
+    int RD_ENABLE_width = node->input_port_sizes[2];
+    int WR_ADDR_width = node->input_port_sizes[3];
+    int WR_DATA_width = node->input_port_sizes[4];
+    int WR_ENABLE_width = node->input_port_sizes[5];
+
+    oassert(CLK_width == 1);
+
+    /* INPUT */
+
+    /* CLK: a single pin (asserted above), stored as a one-element list */
+    int CLK_offset = 0;
+    bram->clk = init_signal_list();
+    add_pin_to_signal_list(bram->clk, node->input_pins[CLK_offset]);
+
+    /* read address pins: start right after the clk pin */
+    int RD_ADDR_offset = CLK_offset + CLK_width;
+    bram->read_addr = init_signal_list();
+    for (int i = 0; i < RD_ADDR_width; ++i) {
+        add_pin_to_signal_list(bram->read_addr, node->input_pins[RD_ADDR_offset + i]);
+    }
+
+    /* read enable pins: follow the read address pins */
+    int RD_ENABLE_offset = RD_ADDR_offset + RD_ADDR_width;
+    bram->read_en = init_signal_list();
+    for (int i = 0; i < RD_ENABLE_width; ++i) {
+        add_pin_to_signal_list(bram->read_en, node->input_pins[RD_ENABLE_offset + i]);
+    }
+
+    /* write addr pins: follow the read enable pins */
+    int WR_ADDR_offset = RD_ENABLE_offset + RD_ENABLE_width;
+    bram->write_addr = init_signal_list();
+    for (int i = 0; i < WR_ADDR_width; ++i) {
+        add_pin_to_signal_list(bram->write_addr, node->input_pins[WR_ADDR_offset + i]);
+    }
+
+    /* write data pins: follow the write addr pins */
+    int WR_DATA_offset = WR_ADDR_offset + WR_ADDR_width;
+    bram->write_data = init_signal_list();
+    for (int i = 0; i < WR_DATA_width; ++i) {
+        add_pin_to_signal_list(bram->write_data, node->input_pins[WR_DATA_offset + i]);
+    }
+
+    /* write enable pins: last group of input pins */
+    int WR_ENABLE_offset = WR_DATA_offset + WR_DATA_width;
+    bram->write_en = init_signal_list();
+    for (int i = 0; i < WR_ENABLE_width; ++i) {
+        add_pin_to_signal_list(bram->write_en, node->input_pins[WR_ENABLE_offset + i]);
+    }
+
+    /* OUTPUT */
+    /* read data pins: the only output port, starting at output pin 0 */
+    int RD_DATA_offset = 0;
+    bram->read_data = init_signal_list();
+    for (int i = 0; i < RD_DATA_width; ++i) {
+        add_pin_to_signal_list(bram->read_data, node->output_pins[RD_DATA_offset + i]);
+    }
+
+    /* keep the source node; the create_*_ram functions in this file read it back as their old_node */
+    bram->node = node;
+
+    /* keep track of location since we need for further node creation */
+    bram->loc = node->loc;
+
+    bram->memory_id = vtr::strdup(node->attributes->memory_id); /* own copy of the id string */
+
+    /* name built from the node name and the memory id; used as the hashtable key */
+    bram->name = make_full_ref_name(bram->node->name, NULL, NULL, bram->memory_id, -1);
+    block_memories_info.block_memories.emplace(bram->name, bram);
+
+    return (bram);
+}
+
+/**
+ * (function: init_read_only_memory)
+ *
+ * @brief: allocates a block_memory_t for a rom node, copies its read pins into signal lists, pads data-in and registers it by name
+ * @return the new rom record; it is also inserted into block_memories_info.read_only_memories
+ * @param node pointing to a rom node
+ * @param netlist pointer to the current netlist file (source of the pad pins)
+ */
+static block_memory_t *init_read_only_memory(nnode_t *node, netlist_t *netlist)
+{
+    block_memory_t *rom = (block_memory_t *)vtr::malloc(sizeof(block_memory_t));
+
+    /**
+     * ROM port layout read from the node
+     * (input pins are indexed back to back, following the offsets computed below)
+     * CLK:        input port [0]
+     * RD_ADDR:    input port [1]
+     * RD_DATA:    output port [0]
+     * RD_ENABLE:  input port [2]
+     */
+
+    int CLK_width = node->input_port_sizes[0];
+    int RD_ADDR_width = node->input_port_sizes[1];
+    int RD_DATA_width = node->output_port_sizes[0];
+    int RD_ENABLE_width = node->input_port_sizes[2];
+    int WR_DATA_width = node->output_port_sizes[0]; /* padded data-in is as wide as the read data port */
+
+    oassert(CLK_width == 1);
+
+    /* INPUT */
+    /* CLK: a single pin (asserted above), stored as a one-element list */
+    int CLK_offset = 0;
+    rom->clk = init_signal_list();
+    add_pin_to_signal_list(rom->clk, node->input_pins[CLK_offset]);
+
+    /* read address pins: start right after the clk pin */
+    int RD_ADDR_offset = CLK_offset + CLK_width;
+    rom->read_addr = init_signal_list();
+    for (int i = 0; i < RD_ADDR_width; ++i) {
+        add_pin_to_signal_list(rom->read_addr, node->input_pins[RD_ADDR_offset + i]);
+    }
+
+    /* read enable pins: follow the read address pins */
+    int RD_ENABLE_offset = RD_ADDR_offset + RD_ADDR_width;
+    rom->read_en = init_signal_list();
+    for (int i = 0; i < RD_ENABLE_width; ++i) {
+        add_pin_to_signal_list(rom->read_en, node->input_pins[RD_ENABLE_offset + i]);
+    }
+
+    /* OUTPUT: read data pins, starting at output pin 0 */
+    int RD_DATA_offset = 0;
+    rom->read_data = init_signal_list();
+    for (int i = 0; i < RD_DATA_width; ++i) {
+        add_pin_to_signal_list(rom->read_data, node->output_pins[RD_DATA_offset + i]);
+    }
+
+    /* PAD DATA IN */
+    /* a rom has no data_in port on the node, so the list is filled with pad pins */
+    rom->write_data = init_signal_list();
+    for (int i = 0; i < WR_DATA_width; ++i) {
+        add_pin_to_signal_list(rom->write_data, get_pad_pin(netlist));
+    }
+
+    /* a rom has no write address and no write enable */
+    rom->write_addr = NULL;
+    rom->write_en = NULL;
+
+    /* keep the source node; the create_*_ram functions in this file read it back as their old_node */
+    rom->node = node;
+
+    /* keep track of location since we need for further node creation */
+    rom->loc = node->loc;
+
+    rom->memory_id = vtr::strdup(node->attributes->memory_id); /* own copy of the id string */
+
+    /* name built from the node name and the memory id; used as the hashtable key */
+    rom->name = make_full_ref_name(rom->node->name, NULL, NULL, rom->memory_id, -1);
+    block_memories_info.read_only_memories.emplace(rom->name, rom);
+
+    return (rom);
+}
+
+/**
+ * (function: create_r_single_port_ram)
+ *
+ * @brief maps a read-only memory with a single read port to a
+ * single port ram: read addr/data become addr/out, data_in is the
+ * pad-pin list built at rom init and we is tied to the zero pin
+ *
+ * @param rom pointing to a rom block memory
+ * @param netlist pointer to the current netlist file
+ */
+static void create_r_single_port_ram(block_memory_t *rom, netlist_t *netlist)
+{
+    nnode_t *old_node = rom->node;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 1);
+    oassert(num_wr_ports == 0);
+
+    /* single port ram signals: zero-initialised and sized for the type actually stored */
+    sp_ram_signals *signals = (sp_ram_signals *)vtr::calloc(1, sizeof(sp_ram_signals));
+
+    /* INPUTS */
+    /* the read addr list is borrowed (not copied) as the spram address */
+    signals->addr = rom->read_addr;
+
+    /* handle clk signal */
+    signals->clk = rom->clk->pins[0];
+
+    /**
+     * we pad the data_in port using pad pins
+     * rom->write_data is already filled with pad pins
+     */
+    signals->data = rom->write_data;
+
+    /* there is no write data to set any we, so it will be connected to GND */
+    signals->we = get_zero_pin(netlist);
+    delete_npin(rom->read_en->pins[0]); /* the read enable pin is not passed to the spram */
+
+    /* OUTPUT */
+    /* the read data list is borrowed as the spram output */
+    signals->out = rom->read_data;
+
+    create_single_port_ram(signals, old_node);
+
+    // CLEAN UP rom src node; only the container is freed since its lists belong to rom
+    cleanup_block_memory_old_node(old_node);
+    vtr::free(signals);
+}
+
+/**
+ * (function: create_2r_dual_port_ram)
+ *
+ * @brief maps a memory with two read ports and no write port
+ * onto a DPRAM: each half of read addr/data feeds one DPRAM port,
+ * both data inputs are pad pins and both write enables are zero pins.
+ *
+ * @param bram pointer to the block memory
+ * @param netlist pointer to the current netlist file
+ */
+static void create_2r_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+
+    int data_width = bram->node->attributes->DBITS;
+    int addr_width = bram->node->attributes->ABITS;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 2);
+    oassert(num_wr_ports == 0);
+    oassert(bram->read_addr->count == 2 * addr_width);
+    oassert(bram->read_data->count == 2 * data_width);
+
+    /* create a list of dpram ram signals (not zeroed: every field is assigned below) */
+    dp_ram_signals *signals = (dp_ram_signals *)vtr::malloc(sizeof(dp_ram_signals));
+
+    /* split read addr and add the first half to the addr1 */
+    signals->addr1 = init_signal_list();
+    for (int i = 0; i < addr_width; ++i) {
+        add_pin_to_signal_list(signals->addr1, bram->read_addr->pins[i]);
+    }
+
+    /* add pad pins as data1 */
+    signals->data1 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(signals->data1, get_pad_pin(netlist));
+    }
+
+    /* there is no write data to set any we, so it will be connected to GND */
+    signals->we1 = get_zero_pin(netlist);
+    delete_npin(bram->read_en->pins[0]); /* first read enable pin is not passed to the dpram */
+
+    /* add clk signal as dpram clk signal */
+    signals->clk = bram->clk->pins[0];
+
+    /* split read data and add the first half to the out1 */
+    signals->out1 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(signals->out1, bram->read_data->pins[i]);
+    }
+
+    /* add the second half of the read addr to addr2 */
+    int addr_2_offset = addr_width;
+    signals->addr2 = init_signal_list();
+    for (int i = 0; i < addr_width; ++i) {
+        add_pin_to_signal_list(signals->addr2, bram->read_addr->pins[addr_2_offset + i]);
+    }
+
+    /* there is no write data to set any we, so it will be connected to GND */
+    signals->we2 = get_zero_pin(netlist);
+    delete_npin(bram->read_en->pins[1]); /* second read enable pin is not passed to the dpram */
+
+    /* there is no write data for port 2 either: add pad pins as data2 */
+    signals->data2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(signals->data2, get_pad_pin(netlist));
+    }
+
+    /* add the second half of the read data to out2 */
+    int out_2_offset = data_width;
+    signals->out2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(signals->out2, bram->read_data->pins[out_2_offset + i]);
+    }
+
+    /* create a new dual port ram */
+    create_dual_port_ram(signals, old_node);
+
+    // CLEAN UP
+    cleanup_block_memory_old_node(old_node);
+    free_dp_ram_signals(signals);
+}
+
+/**
+ * (function: create_nr_single_port_ram)
+ *
+ * @brief multiple read ports are multiplexed using read enable.
+ * Then, the rom will be mapped to a SPRAM
+ *
+ * @param rom pointing to a rom block memory (more than one read port, no write port)
+ * @param netlist pointer to the current netlist file
+ */
+static void create_nr_single_port_ram(block_memory_t *rom, netlist_t *netlist)
+{
+    nnode_t *old_node = rom->node;
+    int data_width = rom->node->attributes->DBITS;
+    int addr_width = rom->node->attributes->ABITS;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* validation */
+    oassert(num_rd_ports > 1);
+    oassert(num_wr_ports == 0);
+
+    /* single port ram signals: zero-initialised and sized for the type actually stored */
+    sp_ram_signals *signals = (sp_ram_signals *)vtr::calloc(1, sizeof(sp_ram_signals));
+    signal_list_t *selectors = NULL;
+
+    /* INPUTS */
+    selectors = copy_input_signals(rom->read_en);
+    /* the read addrs are reduced to one addr_width-wide spram address, selected by the read enables */
+    signals->addr = split_cascade_port(rom->read_addr, selectors, addr_width, old_node, netlist);
+
+    /* add clk signal */
+    signals->clk = rom->clk->pins[0];
+
+    /**
+     * rom->write_data is already initialized with pad pins in rom init
+     */
+    signals->data = copy_input_signals(rom->write_data);
+
+    /* there is no write data to set any we, so it will be connected to GND */
+    signals->we = get_zero_pin(netlist);
+
+    /* OUTPUT */
+    /* left NULL; the spram output pins are read back from the created node below */
+    signals->out = NULL;
+
+    /* create a SPRAM */
+    nnode_t *spram = create_single_port_ram(signals, old_node);
+
+    signal_list_t *spram_outputs = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(spram_outputs, spram->output_pins[i]);
+    }
+
+    /* decode the spram outputs to the n rom output ports */
+    decode_out_port(spram_outputs, rom->read_data, rom->read_en, old_node, netlist);
+
+    // CLEAN UP rom src node
+    free_signal_list(selectors);
+    free_signal_list(spram_outputs);
+
+    free_sp_ram_signals(signals);
+    cleanup_block_memory_old_node(old_node);
+}
+
+/**
+ * (function: create_rw_single_port_ram)
+ *
+ * @brief creates a single port ram for a block memory
+ * with one read and one write port that has the same
+ * addr for both read and write
+ *
+ * @param bram pointing to a bram block memory
+ * @param netlist pointer to the current netlist file (unnamed and unused here)
+ */
+static void create_rw_single_port_ram(block_memory_t *bram, netlist_t * /* netlist */)
+{
+    nnode_t *old_node = bram->node;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 1);
+    oassert(num_wr_ports == 1);
+
+    /* single port ram signals: zero-initialised and sized for the type actually stored */
+    sp_ram_signals *signals = (sp_ram_signals *)vtr::calloc(1, sizeof(sp_ram_signals));
+
+    /* the wr addr will be deleted since the read addr is used as the only address */
+    for (int i = 0; i < bram->write_addr->count; ++i) {
+        npin_t *wr_addr_pin = bram->write_addr->pins[i];
+        /* delete pin */
+        delete_npin(wr_addr_pin);
+    }
+
+    /* INPUTS */
+    /* the read addr list is borrowed (not copied) as the spram address */
+    signals->addr = bram->read_addr;
+
+    /* handling clock signals */
+    signals->clk = bram->clk->pins[0];
+
+    /**
+     * the write data pins collected from the bram node
+     * are used directly as the spram data_in port
+     */
+    signals->data = bram->write_data;
+
+    /* the rd enables will be deleted since we do not need it anymore */
+    for (int i = 0; i < bram->read_en->count; ++i) {
+        npin_t *rd_en_pin = bram->read_en->pins[i];
+        /* delete pin */
+        delete_npin(rd_en_pin);
+    }
+
+    /* reduce the write enables to one bit, since we is a single pin in single port ram */
+    if (bram->write_en->count > 1) {
+        /* NOTE: the enables are NOT OR-ed together; every enable except pins[0] is dropped */
+        // bram->write_en = make_chain(LOGICAL_OR, bram->write_en, old_node);
+        for (int i = 1; i < bram->write_en->count; ++i) {
+            delete_npin(bram->write_en->pins[i]);
+        }
+    }
+    signals->we = bram->write_en->pins[0];
+
+    /* OUTPUT */
+    /* the read data list is borrowed as the spram output */
+    signals->out = bram->read_data;
+
+    create_single_port_ram(signals, old_node);
+
+    // CLEAN UP bram src node; only the container is freed since its lists belong to bram
+    cleanup_block_memory_old_node(old_node);
+    vtr::free(signals);
+}
+
+/**
+ * (function: create_rw_dual_port_ram)
+ *
+ * @brief block_ram will be considered as dual port ram:
+ * port 1 reads (read addr, pad pins as data1, we1 = zero pin),
+ * port 2 writes (write addr/data/en); out2 is left unconnected
+ *
+ * @param bram pointing to a block memory
+ * @param netlist pointer to the current netlist file
+ */
+static void create_rw_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 1);
+    oassert(num_wr_ports == 1);
+
+    /* dual port ram signals */
+    dp_ram_signals *signals = (dp_ram_signals *)vtr::calloc(1, sizeof(dp_ram_signals));
+
+    /* INPUTS: read addr as address1, write addr as address2 (lists are borrowed) */
+    signals->addr1 = bram->read_addr;
+    signals->addr2 = bram->write_addr;
+
+    /* handling the clock signal */
+    signals->clk = bram->clk->pins[0];
+
+    /* adding the write data port as data2 */
+    signals->data2 = bram->write_data;
+
+    /* we pad the first (read side) data port using pad pins */
+    signal_list_t *pad_signals = init_signal_list();
+    for (int i = 0; i < bram->write_data->count; ++i) {
+        add_pin_to_signal_list(pad_signals, get_pad_pin(netlist));
+    }
+    signals->data1 = pad_signals;
+
+    for (int i = 0; i < bram->read_en->count; ++i) {
+        /* delete every read enable pin (index i, not always 0), since port 1 never writes */
+        delete_npin(bram->read_en->pins[i]);
+    }
+    signals->we1 = get_zero_pin(netlist);
+
+    /* adding wr_en as we of port 2 */
+    signals->we2 = bram->write_en->pins[0];
+
+    /* OUTPUT */
+    /* adding the read data port as output1 */
+    signals->out1 = bram->read_data;
+
+    /* leave second output port unconnected: each bit gets a fresh driver pin on its own new net */
+    int offset = bram->read_data->count;
+    signal_list_t *out2_signals = init_signal_list();
+    for (int i = 0; i < bram->read_data->count; i++) {
+        // specify the output pin
+        npin_t *new_pin1 = allocate_npin();
+        npin_t *new_pin2 = allocate_npin();
+        nnet_t *new_net = allocate_nnet();
+        new_net->name = make_full_ref_name(NULL, NULL, NULL, bram->name, offset + i);
+        /* hook up new pin 1 into the new net */
+        add_driver_pin_to_net(new_net, new_pin1);
+        /* hook up the new pin 2 to this new net */
+        add_fanout_pin_to_net(new_net, new_pin2);
+
+        /* adding to signal list */
+        add_pin_to_signal_list(out2_signals, new_pin1);
+    }
+    signals->out2 = out2_signals;
+
+    create_dual_port_ram(signals, old_node);
+
+    // CLEAN UP: only the lists created here are freed, the others belong to bram
+    cleanup_block_memory_old_node(old_node);
+    free_signal_list(pad_signals);
+    free_signal_list(out2_signals);
+    vtr::free(signals);
+}
+
+/**
+ * (function: create_r2w_dual_port_ram)
+ *
+ * @brief maps one read + two write ports to a DPRAM. One write port
+ * MUST share its address with the read port (it becomes port 1);
+ * otherwise the memory is handed to create_nrmw_dual_port_ram.
+ *
+ * @param bram pointer to the block memory
+ * @param netlist pointer to the current netlist file
+ */
+static void create_r2w_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+
+    int data_width = bram->node->attributes->DBITS;
+    int addr_width = bram->node->attributes->ABITS;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 1);
+    oassert(num_wr_ports == 2);
+
+    /* create a list of dpram ram signals (not zeroed: fields are assigned one by one below) */
+    dp_ram_signals *signals = (dp_ram_signals *)vtr::malloc(sizeof(dp_ram_signals));
+
+    /* add read address as addr1 to dpram signal lists */
+    signals->addr1 = init_signal_list();
+    for (int i = 0; i < bram->read_addr->count; ++i) {
+        add_pin_to_signal_list(signals->addr1, bram->read_addr->pins[i]);
+    }
+
+    /* split wr_addr, wr_data and wr_en ports */
+    int addr_2_offset = addr_width;
+    signal_list_t *wr_addr1 = init_signal_list();
+    signal_list_t *wr_addr2 = init_signal_list();
+    for (int i = 0; i < addr_width; ++i) {
+        add_pin_to_signal_list(wr_addr1, bram->write_addr->pins[i]);
+        add_pin_to_signal_list(wr_addr2, bram->write_addr->pins[addr_2_offset + i]);
+    }
+
+    oassert(bram->write_en->count == 2);
+    npin_t *wr_en1 = bram->write_en->pins[0];
+    npin_t *wr_en2 = bram->write_en->pins[1];
+
+    int wr_data_2_offset = data_width;
+    signal_list_t *wr_data1 = init_signal_list();
+    signal_list_t *wr_data2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(wr_data1, bram->write_data->pins[i]);
+        add_pin_to_signal_list(wr_data2, bram->write_data->pins[wr_data_2_offset + i]);
+    }
+
+    /**
+     * [NOTE]:
+     * Odin-II handle memory block with more than two distinct
+     * address ports by muxing read/write ports based on en
+     */
+    bool first_match = sigcmp(wr_addr1, bram->read_addr);
+    bool second_match = sigcmp(wr_addr2, bram->read_addr);
+
+    if (!first_match && !second_match) {
+        first_match = sigcmp(bram->read_addr, wr_addr1); /* retried with swapped arguments */
+        second_match = sigcmp(bram->read_addr, wr_addr2);
+        if (!first_match && !second_match) {
+            // CLEAN UP: the fallback only takes bram, so everything built so far is released
+            free_signal_list(wr_addr1);
+            free_signal_list(wr_addr2);
+            free_signal_list(wr_data1);
+            free_signal_list(wr_data2);
+            free_signal_list(signals->addr1);
+            vtr::free(signals);
+            create_nrmw_dual_port_ram(bram, netlist); /* all ports have different address */
+            return;
+        }
+    }
+
+    /**
+     * one write address is always equal to read addr.
+     * As a result, the corresponding write data will be mapped to data1
+     */
+    signals->data1 = (first_match) ? wr_data1 : wr_data2;
+
+    /* free read en since we do not need in DPRAM model */
+    delete_npin(bram->read_en->pins[0]);
+    /* add write enable signals for matched wr port as we1 */
+    signals->we1 = (first_match) ? wr_en1 : wr_en2;
+
+    /* add clk signal as dpram clk signal */
+    signals->clk = bram->clk->pins[0];
+
+    /* map read data to the out1 */
+    signals->out1 = init_signal_list();
+    for (int i = 0; i < bram->read_data->count; ++i) {
+        add_pin_to_signal_list(signals->out1, bram->read_data->pins[i]);
+    }
+
+    /* OTHER WR RELATED PORTS */
+    /* addr2 for another write addr */
+    signals->addr2 = (first_match) ? wr_addr2 : wr_addr1;
+
+    /* add write enable signals for second wr port as we2 */
+    signals->we2 = (first_match) ? wr_en2 : wr_en1;
+
+    /* the rest of write data pin is for data2 */
+    signals->data2 = (first_match) ? wr_data2 : wr_data1;
+
+    /* out2 will be unconnected: each bit gets a fresh driver pin on its own new net */
+    signals->out2 = init_signal_list();
+    for (int i = 0; i < signals->out1->count; ++i) {
+        /* create the unconnected output pin and its net */
+        npin_t *new_pin1 = allocate_npin();
+        npin_t *new_pin2 = allocate_npin();
+        nnet_t *new_net = allocate_nnet();
+        /* hook up new pin 1 into the new net */
+        add_driver_pin_to_net(new_net, new_pin1);
+        /* hook up the new pin 2 to this new net */
+        add_fanout_pin_to_net(new_net, new_pin2);
+
+        /* add the driver pin to the out2 signal list */
+        add_pin_to_signal_list(signals->out2, new_pin1);
+    }
+
+    /* create a new dual port ram */
+    create_dual_port_ram(signals, old_node);
+
+    // CLEAN UP
+    /* free matched wr addr pins since they are as the same as read addr */
+    for (int i = 0; i < addr_width; ++i) {
+        npin_t *pin = (first_match) ? wr_addr1->pins[i] : wr_addr2->pins[i];
+        /* delete pin */
+        delete_npin(pin);
+    }
+    free_signal_list((first_match) ? wr_addr1 : wr_addr2); /* the other list is signals->addr2 */
+
+    cleanup_block_memory_old_node(old_node);
+    free_dp_ram_signals(signals);
+}
+
+/**
+ * (function: create_2rw_dual_port_ram)
+ *
+ * @brief creates a dual port ram. One of the two read addrs should match the
+ * write addr; if none does, create_2rw_multiplexed_dual_port_ram is used instead
+ *
+ * @param bram pointer to the block memory
+ * @param netlist pointer to the current netlist file
+ */
+static void create_2rw_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+
+    int data_width = bram->node->attributes->DBITS;
+    int addr_width = bram->node->attributes->ABITS;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 2);
+    oassert(num_wr_ports == 1);
+
+    /* create a list of dpram ram signals (not zeroed: fields are assigned one by one below) */
+    dp_ram_signals *signals = (dp_ram_signals *)vtr::malloc(sizeof(dp_ram_signals));
+
+    /* add write address as addr1 to dpram signal lists */
+    signals->addr1 = init_signal_list();
+    for (int i = 0; i < bram->write_addr->count; ++i) {
+        add_pin_to_signal_list(signals->addr1, bram->write_addr->pins[i]);
+    }
+
+    /**
+     * the write address is expected to equal one read addr, so
+     * the write data is mapped to data1 alongside the write address
+     */
+    signals->data1 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(signals->data1, bram->write_data->pins[i]);
+    }
+
+    oassert(bram->write_en->count == 1);
+    /* add the single write enable signal as we1 */
+    signals->we1 = bram->write_en->pins[0];
+
+    /* split rd_addr, rd_data ports */
+    int rd_addr_2_offset = addr_width;
+    signal_list_t *rd_addr1 = init_signal_list();
+    signal_list_t *rd_addr2 = init_signal_list();
+    for (int i = 0; i < addr_width; ++i) {
+        add_pin_to_signal_list(rd_addr1, bram->read_addr->pins[i]);
+        add_pin_to_signal_list(rd_addr2, bram->read_addr->pins[rd_addr_2_offset + i]);
+    }
+
+    int rd_data_2_offset = data_width;
+    signal_list_t *rd_data1 = init_signal_list();
+    signal_list_t *rd_data2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(rd_data1, bram->read_data->pins[i]);
+        add_pin_to_signal_list(rd_data2, bram->read_data->pins[rd_data_2_offset + i]);
+    }
+
+    /**
+     * [NOTE]:
+     * Odin-II handle memory block with more than two distinct
+     * address ports using muxed read/write ports
+     */
+    bool first_match = sigcmp(bram->write_addr, rd_addr1);
+    bool second_match = sigcmp(bram->write_addr, rd_addr2);
+
+    if (!first_match && !second_match) {
+        first_match = sigcmp(rd_addr1, bram->write_addr); /* retried with swapped arguments */
+        second_match = sigcmp(rd_addr2, bram->write_addr);
+        if (!first_match && !second_match) {
+            free_signal_list(rd_addr1);
+            free_signal_list(rd_addr2);
+            free_signal_list(rd_data1);
+            free_signal_list(rd_data2);
+            free_signal_list(signals->addr1); /* the fallback only takes bram: release the partial signals */
+            free_signal_list(signals->data1);
+            vtr::free(signals);
+            create_2rw_multiplexed_dual_port_ram(bram, netlist); /* all ports have different address */
+            return;
+        }
+    }
+
+    /* delete rd pins since we use corresponding wr_en and zero*/
+    for (int i = 0; i < bram->read_en->count; ++i) {
+        delete_npin(bram->read_en->pins[i]);
+    }
+
+    /* map matched read data to the out1 */
+    signals->out1 = (first_match) ? rd_data1 : rd_data2;
+
+    /* add clk signal as dpram clk signal */
+    signals->clk = bram->clk->pins[0];
+
+    /* OTHER RD RELATED PORTS */
+    /* addr2 is the read addr that did not match the write addr */
+    signals->addr2 = (first_match) ? rd_addr2 : rd_addr1;
+
+    /* en 2 should always be 0 since there is no second write port */
+    signals->we2 = get_zero_pin(netlist);
+
+    /* there is no second write port, so data2 is filled with pad pins */
+    signals->data2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(signals->data2, get_pad_pin(netlist));
+    }
+
+    /* out2 map to other read data */
+    signals->out2 = (first_match) ? rd_data2 : rd_data1;
+
+    /* create a new dual port ram */
+    create_dual_port_ram(signals, old_node);
+
+    // CLEAN UP
+    /* free matched rd addr pins since they are as the same as write addr */
+    for (int i = 0; i < addr_width; ++i) {
+        npin_t *pin = (first_match) ? rd_addr1->pins[i] : rd_addr2->pins[i];
+        /* delete pin */
+        delete_npin(pin);
+    }
+    free_signal_list((first_match) ? rd_addr1 : rd_addr2); /* the other list is signals->addr2 */
+
+    cleanup_block_memory_old_node(old_node);
+    free_dp_ram_signals(signals);
+}
+
+/**
+ * (function: create_2r2w_dual_port_ram)
+ *
+ * @brief creates a dual port ram out of a block memory that has two
+ * read ports and two write ports. The first read address is compared
+ * against both write addresses. When one of them matches, the two read
+ * addresses become addr1/addr2 of the DPRAM, and the write data and the
+ * write enable of the matching write port are routed to port 1 while
+ * the remaining write port is routed to port 2. When neither write
+ * address matches, the memory is handed to create_nrmw_dual_port_ram.
+ *
+ * @note only the first read address is compared here; the second read
+ * address is taken to belong to the remaining write port.
+ *
+ * @param bram pointer to the block memory
+ * @param netlist pointer to the current netlist file
+ */
+static void create_2r2w_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+
+    int data_width = bram->node->attributes->DBITS;
+    int addr_width = bram->node->attributes->ABITS;
+    int num_rd_ports = old_node->attributes->RD_PORTS;
+    int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* should have been resolved before this function */
+    oassert(num_rd_ports == 2);
+    oassert(num_wr_ports == 2);
+    oassert(bram->read_addr->count == 2 * addr_width);
+    oassert(bram->read_data->count == 2 * data_width);
+
+    /* split wr_addr, wr_data and wr_en ports */
+    int wr_addr_2_base = addr_width;
+    signal_list_t *wr_addr1 = init_signal_list();
+    signal_list_t *wr_addr2 = init_signal_list();
+    for (int i = 0; i < addr_width; ++i) {
+        add_pin_to_signal_list(wr_addr1, bram->write_addr->pins[i]);
+        add_pin_to_signal_list(wr_addr2, bram->write_addr->pins[wr_addr_2_base + i]);
+    }
+
+    oassert(bram->write_en->count == 2);
+    npin_t *wr_en1 = bram->write_en->pins[0];
+    npin_t *wr_en2 = bram->write_en->pins[1];
+
+    int wr_data_2_base = data_width;
+    signal_list_t *wr_data1 = init_signal_list();
+    signal_list_t *wr_data2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(wr_data1, bram->write_data->pins[i]);
+        add_pin_to_signal_list(wr_data2, bram->write_data->pins[wr_data_2_base + i]);
+    }
+
+    /* split rd_addr, rd_data ports */
+    int rd_addr_2_base = addr_width;
+    signal_list_t *rd_addr1 = init_signal_list();
+    signal_list_t *rd_addr2 = init_signal_list();
+    for (int i = 0; i < addr_width; ++i) {
+        add_pin_to_signal_list(rd_addr1, bram->read_addr->pins[i]);
+        add_pin_to_signal_list(rd_addr2, bram->read_addr->pins[rd_addr_2_base + i]);
+    }
+
+    int rd_data_2_base = data_width;
+    signal_list_t *rd_data1 = init_signal_list();
+    signal_list_t *rd_data2 = init_signal_list();
+    for (int i = 0; i < data_width; ++i) {
+        add_pin_to_signal_list(rd_data1, bram->read_data->pins[i]);
+        add_pin_to_signal_list(rd_data2, bram->read_data->pins[rd_data_2_base + i]);
+    }
+
+    /**
+     * [NOTE]:
+     * Odin-II handles memory blocks with more than two distinct
+     * address ports using muxed read/write ports
+     */
+    bool first_match_read1 = sigcmp(wr_addr1, rd_addr1);
+    bool second_match_read1 = sigcmp(wr_addr2, rd_addr1);
+    if (!first_match_read1 && !second_match_read1) {
+        /* retry with the operands swapped */
+        first_match_read1 = sigcmp(rd_addr1, wr_addr1);
+        second_match_read1 = sigcmp(rd_addr1, wr_addr2);
+    }
+
+    if (!first_match_read1 && !second_match_read1) {
+        // CLEAN UP
+        free_signal_list(wr_addr1);
+        free_signal_list(wr_addr2);
+        free_signal_list(wr_data1);
+        free_signal_list(wr_data2);
+        free_signal_list(rd_addr1);
+        free_signal_list(rd_addr2);
+        free_signal_list(rd_data1);
+        free_signal_list(rd_data2);
+
+        /* all ports have different address */
+        create_nrmw_dual_port_ram(bram, netlist);
+        return;
+    }
+
+    /* delete rd pins since we use corresponding wr_en and zero */
+    for (int i = 0; i < bram->read_en->count; ++i) {
+        delete_npin(bram->read_en->pins[i]);
+    }
+
+    /* create a list of dpram ram signals */
+    dp_ram_signals *signals = (dp_ram_signals *)vtr::malloc(sizeof(dp_ram_signals));
+
+    /* hook the first half of the split read addr into addr1 */
+    signals->addr1 = rd_addr1;
+
+    /* hook the second half of the split read addr into addr2 */
+    signals->addr2 = rd_addr2;
+
+    /* the first half of the read data goes to out1 */
+    signals->out1 = rd_data1;
+
+    /* the second half of the read data goes to out2 */
+    signals->out2 = rd_data2;
+
+    /* data1 is the write data of the write port whose address matched rd_addr1 */
+    signals->data1 = (first_match_read1) ? wr_data1 : wr_data2;
+
+    /* data2 is the write data of the other write port */
+    signals->data2 = (first_match_read1) ? wr_data2 : wr_data1;
+
+    /**
+     * the write enables have to follow the write data: when the second
+     * write port is the one routed to port 1, its enable must gate port 1
+     */
+    signals->we1 = (first_match_read1) ? wr_en1 : wr_en2;
+
+    /* add clk signal as dpram clk signal */
+    signals->clk = bram->clk->pins[0];
+
+    /* the enable of the other write port gates port 2 */
+    signals->we2 = (first_match_read1) ? wr_en2 : wr_en1;
+
+    /* create a new dual port ram */
+    create_dual_port_ram(signals, old_node);
+
+    // CLEAN UP
+    /* at this point wr_addr and rd_addr must be the same */
+    oassert(bram->read_addr->count == bram->write_addr->count);
+    /* the wr addr will be deleted since we do not need it anymore */
+    for (int i = 0; i < bram->write_addr->count; ++i) {
+        npin_t *wr_addr_pin = bram->write_addr->pins[i];
+        /* delete pin */
+        delete_npin(wr_addr_pin);
+    }
+    free_signal_list(wr_addr1);
+    free_signal_list(wr_addr2);
+
+    cleanup_block_memory_old_node(old_node);
+    free_dp_ram_signals(signals);
+}
+
+/**
+ * (function: create_nrmw_dual_port_ram)
+ *
+ * @brief maps a block memory with several read ports and several write
+ * ports onto one DPRAM. The read addresses are passed through
+ * split_cascade_port with the read enables as selectors and drive addr1;
+ * the write addresses and the write data go through the same helper
+ * with the write enables as selectors and drive addr2 / data2.
+ * Port 1 never writes (padded data, we1 tied to zero). The first
+ * data_width DPRAM outputs are handed to decode_out_port together
+ * with the read enables.
+ *
+ * @param bram pointing to a bram node node
+ * @param netlist pointer to the current netlist file
+ */
+static void create_nrmw_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+    const int data_width = old_node->attributes->DBITS;
+    const int addr_width = old_node->attributes->ABITS;
+    const int num_rd_ports = old_node->attributes->RD_PORTS;
+    const int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* simpler port configurations are expected to be handled elsewhere */
+    oassert(num_rd_ports > 1);
+    oassert(num_wr_ports > 1);
+
+    /* zero-initialized set of DPRAM signals */
+    dp_ram_signals *ports = (dp_ram_signals *)vtr::calloc(1, sizeof(dp_ram_signals));
+
+    /* ---- INPUTS ---- */
+    /* addr1: read addresses, selected by a copy of the read enables */
+    signal_list_t *rd_addr_sel = copy_input_signals(bram->read_en);
+    ports->addr1 = split_cascade_port(bram->read_addr, rd_addr_sel, addr_width, old_node, netlist);
+    free_signal_list(rd_addr_sel);
+
+    /* addr2: write addresses, selected by a copy of the write enables */
+    signal_list_t *wr_addr_sel = copy_input_signals(bram->write_en);
+    ports->addr2 = split_cascade_port(bram->write_addr, wr_addr_sel, addr_width, old_node, netlist);
+    free_signal_list(wr_addr_sel);
+
+    /* clock */
+    ports->clk = bram->clk->pins[0];
+
+    /* data1: nothing is written through port 1, so fill it with pad pins */
+    ports->data1 = init_signal_list();
+    int bit = 0;
+    while (bit < data_width) {
+        add_pin_to_signal_list(ports->data1, get_pad_pin(netlist));
+        ++bit;
+    }
+
+    /* data2: write data, selected by another copy of the write enables */
+    signal_list_t *wr_data_sel = copy_input_signals(bram->write_en);
+    ports->data2 = split_cascade_port(bram->write_data, wr_data_sel, data_width, old_node, netlist);
+    free_signal_list(wr_data_sel);
+
+    /* we1: tied to GND because port 1 has no write data */
+    ports->we1 = get_zero_pin(netlist);
+
+    /* we2: one VCC pin per write port, selected by the write enables themselves */
+    signal_list_t *ones = init_signal_list();
+    for (int port = 0; port < num_wr_ports; port++) {
+        add_pin_to_signal_list(ones, get_one_pin(netlist));
+    }
+    signal_list_t *we2_list = split_cascade_port(ones, bram->write_en, 1, old_node, netlist);
+    ports->we2 = we2_list->pins[0];
+
+    /* ---- OUTPUTS ---- */
+    /* outputs stay NULL so that the DPRAM gets new output pins */
+    ports->out1 = NULL;
+    ports->out2 = NULL;
+
+    /* build the DPRAM node */
+    nnode_t *new_dpram = create_dual_port_ram(ports, old_node);
+
+    /* gather the first data_width output pins of the DPRAM */
+    signal_list_t *first_out = init_signal_list();
+    for (int out_bit = 0; out_bit < data_width; out_bit++) {
+        add_pin_to_signal_list(first_out, new_dpram->output_pins[out_bit]);
+    }
+
+    /* distribute the DPRAM outputs over the bram read data ports */
+    decode_out_port(first_out, bram->read_data, bram->read_en, old_node, netlist);
+
+    // CLEAN UP
+    cleanup_block_memory_old_node(old_node);
+    free_signal_list(first_out);
+    free_signal_list(we2_list);
+    free_signal_list(ones);
+    free_dp_ram_signals(ports);
+}
+
+/**
+ * (function: create_2rw_multiplexed_dual_port_ram)
+ *
+ * @brief maps a block memory with two read ports and one write port
+ * onto a DPRAM. The read addresses are passed through
+ * split_cascade_port with the read enables as selectors and drive
+ * addr1; the single write port drives addr2 / data2 / we2 directly.
+ * The first data_width DPRAM outputs are handed to decode_out_port
+ * together with the read enables.
+ *
+ * @param bram pointing to a bram node node
+ * @param netlist pointer to the current netlist file
+ */
+static void create_2rw_multiplexed_dual_port_ram(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *old_node = bram->node;
+    const int data_width = old_node->attributes->DBITS;
+    const int addr_width = old_node->attributes->ABITS;
+    const int num_rd_ports = old_node->attributes->RD_PORTS;
+    const int num_wr_ports = old_node->attributes->WR_PORTS;
+
+    /* the port configuration is expected to be settled by the caller */
+    oassert(num_rd_ports == 2);
+    oassert(num_wr_ports == 1);
+
+    /* zero-initialized set of DPRAM signals */
+    dp_ram_signals *ports = (dp_ram_signals *)vtr::calloc(1, sizeof(dp_ram_signals));
+
+    /* ---- INPUTS ---- */
+    /* addr1: read addresses, selected by a copy of the read enables */
+    signal_list_t *rd_sel = copy_input_signals(bram->read_en);
+    ports->addr1 = split_cascade_port(bram->read_addr, rd_sel, addr_width, old_node, netlist);
+    free_signal_list(rd_sel);
+
+    /* addr2: the write address pins, taken over one by one */
+    ports->addr2 = init_signal_list();
+    for (int a = 0; a < bram->write_addr->count; a++) {
+        add_pin_to_signal_list(ports->addr2, bram->write_addr->pins[a]);
+    }
+
+    /* clock */
+    ports->clk = bram->clk->pins[0];
+
+    /* data1: port 1 never writes, so it only receives pad pins */
+    ports->data1 = init_signal_list();
+    for (int d = 0; d < data_width; d++) {
+        add_pin_to_signal_list(ports->data1, get_pad_pin(netlist));
+    }
+
+    /* data2: the write data pins */
+    ports->data2 = init_signal_list();
+    for (int d = 0; d < data_width; d++) {
+        add_pin_to_signal_list(ports->data2, bram->write_data->pins[d]);
+    }
+
+    /* we1 is GND (no data on port 1), we2 is the only write enable */
+    ports->we1 = get_zero_pin(netlist);
+    ports->we2 = bram->write_en->pins[0];
+
+    /* ---- OUTPUTS ---- */
+    /* outputs stay NULL so that the DPRAM gets new output pins */
+    ports->out1 = NULL;
+    ports->out2 = NULL;
+
+    /* build the DPRAM node */
+    nnode_t *new_dpram = create_dual_port_ram(ports, old_node);
+
+    /* gather the first data_width output pins of the DPRAM */
+    signal_list_t *first_out = init_signal_list();
+    for (int o = 0; o < data_width; o++) {
+        add_pin_to_signal_list(first_out, new_dpram->output_pins[o]);
+    }
+
+    /* distribute the DPRAM outputs over the bram read data ports */
+    decode_out_port(first_out, bram->read_data, bram->read_en, old_node, netlist);
+
+    // CLEAN UP
+    cleanup_block_memory_old_node(old_node);
+    free_signal_list(first_out);
+    free_dp_ram_signals(ports);
+}
+
+/**
+ * (function: map_rom_to_mem_hardblocks)
+ *
+ * @brief selects how a read-only memory is implemented based on its
+ * number of read ports: one port goes to create_r_single_port_ram,
+ * two ports go to create_2r_dual_port_ram and any other count goes to
+ * create_nr_single_port_ram. A ROM whose relative area (depth * width)
+ * lies within the LUTRAM inference thresholds, while a LUTRAM model is
+ * available, is currently left unmapped (TODO).
+ *
+ * @param rom pointer to the read only memory
+ * @param netlist pointer to the current netlist file
+ */
+void map_rom_to_mem_hardblocks(block_memory_t *rom, netlist_t *netlist)
+{
+    nnode_t *rom_node = rom->node;
+
+    const int data_bits = rom_node->attributes->DBITS;
+    const int num_words = shift_left_value_with_overflow_check(0X1, rom_node->attributes->ABITS, rom->loc);
+
+    const int num_rd = rom_node->attributes->RD_PORTS;
+    const int num_wr = rom_node->attributes->WR_PORTS;
+
+    /* a read-only memory must not have any write port */
+    oassert(num_wr == 0);
+
+    const int relative_area = num_words * data_bits;
+    t_model *lutram = find_hard_block(LUTRAM_string);
+
+    const bool in_lutram_window = (LUTRAM_INFERENCE_THRESHOLD_MIN <= relative_area)
+                                  && (relative_area <= LUTRAM_INFERENCE_THRESHOLD_MAX);
+    if (lutram != NULL && in_lutram_window) {
+        /* TODO: map to LUTRAM */
+        return;
+    }
+
+    switch (num_rd) {
+        case 1:
+            /* single read port: ports laid out as for the SPRAM hard block */
+            create_r_single_port_ram(rom, netlist);
+            break;
+        case 2:
+            /* two read ports: ports laid out as for the DPRAM hard block */
+            create_2r_dual_port_ram(rom, netlist);
+            break;
+        default:
+            /* any other count: multiplexed read ports on a SPRAM */
+            create_nr_single_port_ram(rom, netlist);
+            break;
+    }
+}
+
+/**
+ * (function: map_bram_to_mem_hardblocks)
+ *
+ * @brief mapping a block_memory (has both read and write access) to single_port_ram
+ * or dual_port_ram according to the number and source of its ports:
+ *   - 1 read / 1 write, same address:      create_rw_single_port_ram
+ *   - 1 read / 1 write, different address: create_rw_dual_port_ram
+ *   - 1 read / 2 write:                    create_r2w_dual_port_ram
+ *   - 2 read / 1 write:                    create_2rw_dual_port_ram
+ *   - 2 read / 2 write:                    create_2r2w_dual_port_ram
+ *   - anything else:                       create_nrmw_dual_port_ram
+ * A memory whose relative area (depth * width) lies within the LUTRAM
+ * inference thresholds, while a LUTRAM model is available, is currently
+ * left unmapped (TODO).
+ *
+ * @param bram pointer to the block memory
+ * @param netlist pointer to the current netlist file
+ */
+void map_bram_to_mem_hardblocks(block_memory_t *bram, netlist_t *netlist)
+{
+    nnode_t *node = bram->node;
+
+    int width = node->attributes->DBITS;
+    int depth = shift_left_value_with_overflow_check(0X1, node->attributes->ABITS, bram->loc);
+
+    /* since the data1_w + data2_w == data_out for DPRAM */
+    long rd_ports = node->attributes->RD_PORTS;
+    long wr_ports = node->attributes->WR_PORTS;
+
+    /**
+     * Potential place for checking block ram if their relative
+     * size is less than a threshold, they could be mapped on LUTRAM
+     */
+
+    int bram_relative_area = depth * width;
+    t_model *lutram_model = find_hard_block(LUTRAM_string);
+
+    if (lutram_model != NULL && (LUTRAM_INFERENCE_THRESHOLD_MIN <= bram_relative_area) && (bram_relative_area <= LUTRAM_INFERENCE_THRESHOLD_MAX)) {
+        /* map to LUTRAM */
+        // nnode_t* lutram = NULL;
+        /* TODO */
+    } else {
+        /**
+         * exactly one read and one write port; both counts are tested
+         * explicitly so that no other combination can reach this branch
+         */
+        if (rd_ports == 1 && wr_ports == 1) {
+            if (check_same_addrs(bram)) {
+                /* create a single port ram and allocate ports according to the SPRAM hard block */
+                create_rw_single_port_ram(bram, netlist);
+
+            } else {
+                /* create a dual port ram and allocate ports according to the DPRAM hard block */
+                create_rw_dual_port_ram(bram, netlist);
+            }
+
+        } else if (rd_ports == 1 && wr_ports == 2) {
+            /* create a dual port ram and allocate ports according to the DPRAM hard block */
+            create_r2w_dual_port_ram(bram, netlist);
+
+        } else if (rd_ports == 2 && wr_ports == 1) {
+            /* create a dual port ram and allocate ports according to the DPRAM hard block */
+            create_2rw_dual_port_ram(bram, netlist);
+
+        } else if (rd_ports == 2 && wr_ports == 2) {
+            /* create a dual port ram and allocate ports according to the DPRAM hard block */
+            create_2r2w_dual_port_ram(bram, netlist);
+
+        } else {
+            /* create a dual port ram and mux all reads together and all writes together */
+            create_nrmw_dual_port_ram(bram, netlist);
+        }
+    }
+}
+/**
+ * (function: resolve_bram_node)
+ *
+ * @brief builds a block memory record for the given bram node with
+ * init_block_memory, runs perform_optimization on it and inserts it
+ * into block_memories_info.block_memory_list
+ *
+ * @param node pointing to a bram node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ */
+void resolve_bram_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* a bram node is expected to have six input ports and one output port */
+    oassert(node->num_input_port_sizes == 6);
+    oassert(node->num_output_port_sizes == 1);
+
+    /* set up the block memory and optimize the inferred memory */
+    block_memory_t *new_bram = init_block_memory(node, netlist);
+    perform_optimization(new_bram);
+
+    /* record it in the list of block memories */
+    block_memories_info.block_memory_list
+        = insert_in_vptr_list(block_memories_info.block_memory_list, new_bram);
+}
+
+/**
+ * (function: resolve_rom_node)
+ *
+ * @brief builds a read-only memory record for the given rom node with
+ * init_read_only_memory, runs perform_optimization on it and inserts
+ * it into block_memories_info.read_only_memory_list
+ *
+ * @param node pointing to a rom node node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ */
+void resolve_rom_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* a rom node is expected to have three input ports and one output port */
+    oassert(node->num_input_port_sizes == 3);
+    oassert(node->num_output_port_sizes == 1);
+
+    /* set up the read-only memory and optimize the inferred memory */
+    block_memory_t *new_rom = init_read_only_memory(node, netlist);
+    perform_optimization(new_rom);
+
+    /* record it in the list of read-only memories */
+    block_memories_info.read_only_memory_list
+        = insert_in_vptr_list(block_memories_info.read_only_memory_list, new_rom);
+}
+
+/**
+ * (function: resolve_ymem_node)
+ *
+ * @brief turns a ymem node into a bram (7 input ports, 1 output port)
+ * or a rom (3 input ports, 1 output port) and resolves the result.
+ * Nodes with any other port layout are left untouched.
+ *
+ * @param node pointing to a ymem node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ */
+void resolve_ymem_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* both supported layouts have exactly one output port */
+    if (node->num_output_port_sizes != 1) {
+        return;
+    }
+
+    switch (node->num_input_port_sizes) {
+        case 7: {
+            /* read + write ports: convert to a BRAM node, then resolve it */
+            nnode_t *as_bram = ymem_to_bram(node, traverse_mark_number);
+            resolve_bram_node(as_bram, traverse_mark_number, netlist);
+            break;
+        }
+        case 3: {
+            /* read ports only: convert to a ROM node, then resolve it */
+            nnode_t *as_rom = ymem_to_rom(node, traverse_mark_number);
+            resolve_rom_node(as_rom, traverse_mark_number, netlist);
+            break;
+        }
+        default:
+            /* unknown layout: nothing to do */
+            break;
+    }
+}
+
+/**
+ * (function: resolve_ymem2_node)
+ *
+ * @brief turns a ymem2 node into a bram (9 input ports, 1 output port)
+ * or a rom (5 input ports, 1 output port) and resolves the result.
+ * Nodes with any other port layout are left untouched.
+ *
+ * @param node pointing to a ymem2 node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ */
+void resolve_ymem2_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* both supported layouts have exactly one output port */
+    if (node->num_output_port_sizes != 1) {
+        return;
+    }
+
+    switch (node->num_input_port_sizes) {
+        case 9: {
+            /* read + write ports: convert to a BRAM node, then resolve it */
+            nnode_t *as_bram = ymem2_to_bram(node, traverse_mark_number);
+            resolve_bram_node(as_bram, traverse_mark_number, netlist);
+            break;
+        }
+        case 5: {
+            /* read ports only: convert to a ROM node, then resolve it */
+            nnode_t *as_rom = ymem2_to_rom(node, traverse_mark_number);
+            resolve_rom_node(as_rom, traverse_mark_number, netlist);
+            break;
+        }
+        default:
+            /* unknown layout: nothing to do */
+            break;
+    }
+}
+
+/**
+ * (function: iterate_block_memories)
+ *
+ * @brief walks block_memories_info.block_memory_list and then
+ * block_memories_info.read_only_memory_list, mapping every entry with
+ * map_bram_to_mem_hardblocks / map_rom_to_mem_hardblocks respectively
+ *
+ * @param netlist pointer to the current netlist file
+ */
+void iterate_block_memories(netlist_t *netlist)
+{
+    /* block memories first */
+    for (t_linked_vptr *it = block_memories_info.block_memory_list; it != NULL; it = it->next) {
+        block_memory_t *bram = (block_memory_t *)it->data_vptr;
+
+        /* every list entry must carry a memory */
+        oassert(bram != NULL);
+
+        map_bram_to_mem_hardblocks(bram, netlist);
+    }
+
+    /* then the read-only memories */
+    for (t_linked_vptr *it = block_memories_info.read_only_memory_list; it != NULL; it = it->next) {
+        block_memory_t *rom = (block_memory_t *)it->data_vptr;
+
+        /* every list entry must carry a memory */
+        oassert(rom != NULL);
+
+        map_rom_to_mem_hardblocks(rom, netlist);
+    }
+}
+
+/**
+ * (function: check_same_addrs)
+ *
+ * @brief tells whether the read address and the write address of a
+ * block memory are the same: they must have the same number of pins
+ * and sigcmp must report them as equal.
+ *
+ * @param bram pointer to the block memory
+ *
+ * @return read_addr == write_addr
+ */
+static bool check_same_addrs(block_memory_t *bram)
+{
+    const bool same_width = (bram->read_addr->count == bram->write_addr->count);
+
+    /* sigcmp is only consulted when the widths agree */
+    return (same_width && sigcmp(bram->read_addr, bram->write_addr));
+}
+
+/**
+ * (function: ymem_to_rom)
+ *
+ * @brief map ymem to rom. A new ROM node is allocated, the pins of the
+ * ymem node are remapped onto it in the order CLK, RD_ADDR, RD_ENABLE
+ * (inputs) and RD_DATA (output), and the ymem node is freed. Only the
+ * first RD_CLK pin is kept; the remaining clock pins are deleted.
+ *
+ * @param node pointing to a ymem node with 3 input ports and 1 output port
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ *
+ * @return the newly created ROM node
+ */
+static nnode_t *ymem_to_rom(nnode_t *node, uintptr_t traverse_mark_number)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* validate the port layout before the port size arrays are indexed */
+    oassert(node->num_input_port_sizes == 3);
+    oassert(node->num_output_port_sizes == 1);
+
+    int offset, new_offset = 0;
+    int addr_width = node->attributes->ABITS;
+    int data_width = node->attributes->DBITS;
+    int num_rd_ports = node->attributes->RD_PORTS;
+
+    int RD_ADDR_width = node->input_port_sizes[0];
+    int RD_CLK_width = node->input_port_sizes[1];
+    int RD_DATA_width = node->output_port_sizes[0];
+    int RD_ENABLE_width = node->input_port_sizes[2];
+
+    /* create ROM node */
+    nnode_t *transformed_mem = allocate_nnode(node->loc);
+    transformed_mem->traverse_visited = traverse_mark_number;
+    transformed_mem->type = ROM;
+    copy_attribute(transformed_mem->attributes, node->attributes);
+    transformed_mem->name = node_name(transformed_mem, node->name);
+    transformed_mem->related_ast_node = node->related_ast_node;
+    /**
+     * ROM information (ymem port ==> ROM port)
+     *
+     * RD_ADDR:    input port [0]  ==> CLK:        input port [0]
+     * RD_CLK:     input port [1]  ==> RD_ADDR:    input port [1]
+     * RD_DATA:    output port[0]  ==> RD_DATA:    output port [0]
+     * RD_ENABLE:  input port [2]  ==> RD_ENABLE:  input port [2]
+     */
+
+    /* CLK: the ymem clock pins come right after the read address pins */
+    offset = RD_ADDR_width;
+    add_input_port_information(transformed_mem, 1);
+    allocate_more_input_pins(transformed_mem, 1);
+    for (int i = 0; i < RD_CLK_width; i++) {
+        if (i == 0) {
+            /* the first clock pin becomes the single ROM clock */
+            remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, 0);
+        } else {
+            /* delete extra pins */
+            delete_npin(node->input_pins[i + offset]);
+        }
+    }
+    new_offset += 1;
+
+    /* RD_ADDR */
+    offset = 0;
+    oassert(RD_ADDR_width == num_rd_ports * addr_width);
+    add_input_port_information(transformed_mem, RD_ADDR_width);
+    allocate_more_input_pins(transformed_mem, RD_ADDR_width);
+    for (int i = 0; i < RD_ADDR_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += RD_ADDR_width;
+
+    /* RD_ENABLE */
+    offset = RD_ADDR_width + RD_CLK_width;
+    oassert(RD_ENABLE_width == num_rd_ports);
+    add_input_port_information(transformed_mem, RD_ENABLE_width);
+    allocate_more_input_pins(transformed_mem, RD_ENABLE_width);
+    for (int i = 0; i < RD_ENABLE_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += RD_ENABLE_width;
+
+    /* RD_DATA */
+    offset = 0;
+    oassert(RD_DATA_width == num_rd_ports * data_width);
+    add_output_port_information(transformed_mem, RD_DATA_width);
+    allocate_more_output_pins(transformed_mem, RD_DATA_width);
+    for (int i = 0; i < RD_DATA_width; i++) {
+        remap_pin_to_new_node(node->output_pins[i + offset], transformed_mem, i);
+    }
+
+    // CLEAN UP
+    free_nnode(node);
+
+    return (transformed_mem);
+}
+/**
+ * (function: ymem2_to_rom)
+ *
+ * @brief map ymem2 to rom. A new ROM node is allocated, the pins of
+ * the ymem2 node are remapped onto it in the order CLK, RD_ADDR,
+ * RD_ENABLE (inputs) and RD_DATA (output), and the ymem2 node is freed.
+ * The RD_ARST and RD_SRST pins are deleted, and only the first RD_CLK
+ * pin is kept.
+ *
+ * @param node pointing to a ymem2 node with 5 input ports and 1 output port
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ *
+ * @return the newly created ROM node
+ */
+static nnode_t *ymem2_to_rom(nnode_t *node, uintptr_t traverse_mark_number)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* validate the port layout before the port size arrays are indexed */
+    oassert(node->num_input_port_sizes == 5);
+    oassert(node->num_output_port_sizes == 1);
+
+    int offset, new_offset = 0;
+    int addr_width = node->attributes->ABITS;
+    int data_width = node->attributes->DBITS;
+    int num_rd_ports = node->attributes->RD_PORTS;
+
+    int RD_ADDR_width = node->input_port_sizes[0];
+    int RD_ARST_width = node->input_port_sizes[1];
+    int RD_CLK_width = node->input_port_sizes[2];
+    int RD_DATA_width = node->output_port_sizes[0];
+    int RD_ENABLE_width = node->input_port_sizes[3];
+    int RD_SRST_width = node->input_port_sizes[4];
+
+    /* create ROM node */
+    nnode_t *transformed_mem = allocate_nnode(node->loc);
+    transformed_mem->traverse_visited = traverse_mark_number;
+    transformed_mem->type = ROM;
+    copy_attribute(transformed_mem->attributes, node->attributes);
+    transformed_mem->name = node_name(transformed_mem, node->name);
+    transformed_mem->related_ast_node = node->related_ast_node;
+
+    /* ARST: not carried over to the ROM node */
+    offset = RD_ADDR_width;
+    for (int i = 0; i < RD_ARST_width; i++) {
+        delete_npin(node->input_pins[offset + i]);
+    }
+    /* SRST: not carried over to the ROM node */
+    offset = RD_ADDR_width + RD_ARST_width + RD_CLK_width + RD_ENABLE_width;
+    for (int i = 0; i < RD_SRST_width; i++) {
+        delete_npin(node->input_pins[offset + i]);
+    }
+    /* CLK */
+    offset = RD_ADDR_width + RD_ARST_width;
+    add_input_port_information(transformed_mem, 1);
+    allocate_more_input_pins(transformed_mem, 1);
+    for (int i = 0; i < RD_CLK_width; i++) {
+        if (i == 0) {
+            /* the first clock pin becomes the single ROM clock */
+            remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, 0);
+        } else {
+            /* delete extra pins */
+            delete_npin(node->input_pins[i + offset]);
+        }
+    }
+    new_offset += 1;
+
+    /* RD_ADDR */
+    offset = 0;
+    oassert(RD_ADDR_width == num_rd_ports * addr_width);
+    add_input_port_information(transformed_mem, RD_ADDR_width);
+    allocate_more_input_pins(transformed_mem, RD_ADDR_width);
+    for (int i = 0; i < RD_ADDR_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += RD_ADDR_width;
+
+    /* RD_ENABLE */
+    offset = RD_ADDR_width + RD_ARST_width + RD_CLK_width;
+    oassert(RD_ENABLE_width == num_rd_ports);
+    add_input_port_information(transformed_mem, RD_ENABLE_width);
+    allocate_more_input_pins(transformed_mem, RD_ENABLE_width);
+    for (int i = 0; i < RD_ENABLE_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += RD_ENABLE_width;
+
+    /* RD_DATA */
+    offset = 0;
+    oassert(RD_DATA_width == num_rd_ports * data_width);
+    add_output_port_information(transformed_mem, RD_DATA_width);
+    allocate_more_output_pins(transformed_mem, RD_DATA_width);
+    for (int i = 0; i < RD_DATA_width; i++) {
+        remap_pin_to_new_node(node->output_pins[i + offset], transformed_mem, i);
+    }
+
+    // CLEAN UP
+    free_nnode(node);
+
+    return (transformed_mem);
+}
+/**
+ * (function: ymem_to_bram)
+ *
+ * @brief map ymem to bram
+ *
+ * @param node pointing to a bram node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ */
+static nnode_t *ymem_to_bram(nnode_t *node, uintptr_t traverse_mark_number)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    int offset, new_offset = 0;
+    int addr_width = node->attributes->ABITS;
+    int data_width = node->attributes->DBITS;
+    int num_rd_ports = node->attributes->RD_PORTS;
+    int num_wr_ports = node->attributes->WR_PORTS;
+
+    int RD_ADDR_width = node->input_port_sizes[0];
+    int RD_CLK_width = node->input_port_sizes[1];
+    int RD_DATA_width = node->output_port_sizes[0];
+    int RD_ENABLE_width = node->input_port_sizes[2];
+    int WR_ADDR_width = node->input_port_sizes[3];
+    int WR_CLK_width = node->input_port_sizes[4];
+    int WR_DATA_width = node->input_port_sizes[5];
+    int WR_ENABLE_width = node->input_port_sizes[6];
+
+    /* check for BRAM */
+    oassert(node->num_input_port_sizes == 7);
+    oassert(node->num_output_port_sizes == 1);
+
+    /* create BRAM node */
+    nnode_t *transformed_mem = allocate_nnode(node->loc);
+    transformed_mem->traverse_visited = traverse_mark_number;
+    transformed_mem->type = BRAM;
+    copy_attribute(transformed_mem->attributes, node->attributes);
+    transformed_mem->name = node_name(transformed_mem, node->name);
+    transformed_mem->related_ast_node = node->related_ast_node;
+    /**
+     * BRAM information
+     *
+     * RD_ADDR:    input port [0]  ==> CLK:        input port [0]
+     * RD_CLK:     input port [1]  ==> RD_ADDR:    input port [1]
+     *   RD_DATA:    output port[0]  ==> RD_DATA:    output port[0]
+     * RD_ENABLE:  input port [2]  ==> RD_ENABLE:  input port [2]
+     * WR_ADDR:    input port [3]  ==> WR_ADDR:    input port [3]
+     * WR_CLK:     input port [4]  ==> WR_DATA:    input port [4]
+     * WR_DATA:    input port [5]  ==> WR_ENABLE:  input port [5]
+     * WR_ENABLE:  input port [6]
+     */
+
+    /* CLK */
+    offset = RD_ADDR_width + RD_CLK_width + RD_ENABLE_width + WR_ADDR_width;
+    add_input_port_information(transformed_mem, 1);
+    allocate_more_input_pins(transformed_mem, 1);
+    for (int i = 0; i < WR_CLK_width; i++) {
+        if (i == 0) {
+            remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, 0);
+        } else {
+            /* delete extra pins */
+            delete_npin(node->input_pins[i + offset]);
+        }
+    }
+    new_offset += 1;
+
+    /* RD_ADDR */
+    offset = 0;
+    oassert(RD_ADDR_width == num_rd_ports * addr_width);
+    add_input_port_information(transformed_mem, RD_ADDR_width);
+    allocate_more_input_pins(transformed_mem, RD_ADDR_width);
+    for (int i = 0; i < RD_ADDR_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += RD_ADDR_width;
+
+    /* RD_CLK */
+    offset = RD_ADDR_width;
+    oassert(RD_CLK_width == num_rd_ports);
+    for (int i = 0; i < RD_CLK_width; i++) {
+        delete_npin(node->input_pins[i + offset]);
+    }
+
+    /* RD_ENABLE */
+    offset = RD_ADDR_width + RD_CLK_width;
+    oassert(RD_ENABLE_width == num_rd_ports);
+    add_input_port_information(transformed_mem, RD_ENABLE_width);
+    allocate_more_input_pins(transformed_mem, RD_ENABLE_width);
+    for (int i = 0; i < RD_ENABLE_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += RD_ENABLE_width;
+
+    /* WR_ADDR */
+    offset = RD_ADDR_width + RD_CLK_width + RD_ENABLE_width;
+    oassert(WR_ADDR_width == num_wr_ports * addr_width);
+    add_input_port_information(transformed_mem, WR_ADDR_width);
+    allocate_more_input_pins(transformed_mem, WR_ADDR_width);
+    for (int i = 0; i < WR_ADDR_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += WR_ADDR_width;
+
+    /* WR_DATA */
+    offset = RD_ADDR_width + RD_CLK_width + RD_ENABLE_width + WR_ADDR_width + WR_CLK_width;
+    oassert(WR_DATA_width == num_wr_ports * data_width);
+    add_input_port_information(transformed_mem, WR_DATA_width);
+    allocate_more_input_pins(transformed_mem, WR_DATA_width);
+    for (int i = 0; i < WR_DATA_width; i++) {
+        remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, i + new_offset);
+    }
+    new_offset += WR_DATA_width;
+
+    /* WR_ENABLE */
+    offset = RD_ADDR_width + RD_CLK_width + RD_ENABLE_width + WR_ADDR_width + WR_CLK_width + WR_DATA_width;
+    oassert(WR_ENABLE_width == num_wr_ports * data_width);
+    add_input_port_information(transformed_mem, num_wr_ports);
+    allocate_more_input_pins(transformed_mem, num_wr_ports);
+    for (int i = 0; i < WR_ENABLE_width; i++) {
+        if (i % data_width == 0)
+            remap_pin_to_new_node(node->input_pins[i + offset], transformed_mem, new_offset++);
+        else
+            delete_npin(node->input_pins[i + offset]);
+    }
+
+    /* RD_DATA */
+    offset = 0;
+    oassert(RD_DATA_width == num_rd_ports * data_width);
+    add_output_port_information(transformed_mem, RD_DATA_width);
+    allocate_more_output_pins(transformed_mem, RD_DATA_width);
+    for (int i = 0; i < RD_DATA_width; i++) {
+        remap_pin_to_new_node(node->output_pins[i + offset], transformed_mem, i);
+    }
+
+    // CLEAN UP
+    free_nnode(node);
+
+    return (transformed_mem);
+}
+
+/**
+ * (function: ymem2_to_bram)
+ *
+ * @brief build a BRAM node from a ymem2 node: the pins the BRAM keeps are
+ * moved onto the new node, the rest are deleted and the old node is freed
+ *
+ * Port mapping (old node ==> BRAM node):
+ *   RD_ADDR:   input port [0]  ==> RD_ADDR:   input port [1]
+ *   RD_ARST:   input port [1]  ==> (deleted)
+ *   RD_CLK:    input port [2]  ==> (deleted)
+ *   RD_ENABLE: input port [3]  ==> RD_ENABLE: input port [2]
+ *   RD_SRST:   input port [4]  ==> (deleted)
+ *   WR_ADDR:   input port [5]  ==> WR_ADDR:   input port [3]
+ *   WR_CLK:    input port [6]  ==> CLK:       input port [0] (first pin only)
+ *   WR_DATA:   input port [7]  ==> WR_DATA:   input port [4]
+ *   WR_ENABLE: input port [8]  ==> WR_ENABLE: input port [5] (one pin per write port)
+ *   RD_DATA:   output port[0]  ==> RD_DATA:   output port[0]
+ *
+ * @param node pointer to the ymem2 node; it is freed before returning
+ * @param traverse_mark_number unique traversal mark, copied to the BRAM node
+ * @return the newly created BRAM node
+ */
+static nnode_t *ymem2_to_bram(nnode_t *node, uintptr_t traverse_mark_number)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+    /* check for BRAM before the port size arrays are indexed */
+    oassert(node->num_input_port_sizes == 9);
+    oassert(node->num_output_port_sizes == 1);
+
+    const int addr_width = node->attributes->ABITS;
+    const int data_width = node->attributes->DBITS;
+    const int num_rd_ports = node->attributes->RD_PORTS;
+    const int num_wr_ports = node->attributes->WR_PORTS;
+
+    const int RD_ADDR_width = node->input_port_sizes[0];
+    const int RD_ARST_width = node->input_port_sizes[1];
+    const int RD_CLK_width = node->input_port_sizes[2];
+    const int RD_ENABLE_width = node->input_port_sizes[3];
+    const int RD_SRST_width = node->input_port_sizes[4];
+    const int WR_ADDR_width = node->input_port_sizes[5];
+    const int WR_CLK_width = node->input_port_sizes[6];
+    const int WR_DATA_width = node->input_port_sizes[7];
+    const int WR_ENABLE_width = node->input_port_sizes[8];
+    const int RD_DATA_width = node->output_port_sizes[0];
+
+    /* index of the first pin of each input port of the old node (RD_ADDR starts at 0) */
+    const int RD_ARST_base = RD_ADDR_width;
+    const int RD_CLK_base = RD_ARST_base + RD_ARST_width;
+    const int RD_ENABLE_base = RD_CLK_base + RD_CLK_width;
+    const int RD_SRST_base = RD_ENABLE_base + RD_ENABLE_width;
+    const int WR_ADDR_base = RD_SRST_base + RD_SRST_width;
+    const int WR_CLK_base = WR_ADDR_base + WR_ADDR_width;
+    const int WR_DATA_base = WR_CLK_base + WR_CLK_width;
+    const int WR_ENABLE_base = WR_DATA_base + WR_DATA_width;
+
+    /* create BRAM node */
+    nnode_t *transformed_mem = allocate_nnode(node->loc);
+    transformed_mem->traverse_visited = traverse_mark_number;
+    transformed_mem->type = BRAM;
+    copy_attribute(transformed_mem->attributes, node->attributes);
+    transformed_mem->name = node_name(transformed_mem, node->name);
+    transformed_mem->related_ast_node = node->related_ast_node;
+
+    int new_offset = 0; /* next free input pin index of the BRAM node */
+    /* delete `width` input pins of the old node starting at index `base` */
+    auto drop_inputs = [&](int base, int width) {
+        for (int i = 0; i < width; i++)
+            delete_npin(node->input_pins[base + i]);
+    };
+    /* add a `width` wide input port to the BRAM node and move the old pins onto it */
+    auto move_inputs = [&](int base, int width) {
+        add_input_port_information(transformed_mem, width);
+        allocate_more_input_pins(transformed_mem, width);
+        for (int i = 0; i < width; i++)
+            remap_pin_to_new_node(node->input_pins[base + i], transformed_mem, new_offset + i);
+        new_offset += width;
+    };
+
+    /* ARST and SRST are not carried over */
+    drop_inputs(RD_ARST_base, RD_ARST_width);
+    drop_inputs(RD_SRST_base, RD_SRST_width);
+
+    /* CLK: keep the first write clock pin and delete the extra pins */
+    add_input_port_information(transformed_mem, 1);
+    allocate_more_input_pins(transformed_mem, 1);
+    if (WR_CLK_width > 0)
+        remap_pin_to_new_node(node->input_pins[WR_CLK_base], transformed_mem, 0);
+    drop_inputs(WR_CLK_base + 1, WR_CLK_width - 1);
+    new_offset += 1;
+
+    /* RD_ADDR */
+    oassert(RD_ADDR_width == num_rd_ports * addr_width);
+    move_inputs(0, RD_ADDR_width);
+
+    /* RD_CLK: not carried over */
+    oassert(RD_CLK_width == num_rd_ports);
+    drop_inputs(RD_CLK_base, RD_CLK_width);
+
+    /* RD_ENABLE */
+    oassert(RD_ENABLE_width == num_rd_ports);
+    move_inputs(RD_ENABLE_base, RD_ENABLE_width);
+
+    /* WR_ADDR */
+    oassert(WR_ADDR_width == num_wr_ports * addr_width);
+    move_inputs(WR_ADDR_base, WR_ADDR_width);
+
+    /* WR_DATA */
+    oassert(WR_DATA_width == num_wr_ports * data_width);
+    move_inputs(WR_DATA_base, WR_DATA_width);
+
+    /* WR_ENABLE: keep the first enable pin of every write port, delete the rest */
+    oassert(WR_ENABLE_width == num_wr_ports * data_width);
+    add_input_port_information(transformed_mem, num_wr_ports);
+    allocate_more_input_pins(transformed_mem, num_wr_ports);
+    for (int i = 0; i < WR_ENABLE_width; i++) {
+        if (i % data_width == 0)
+            remap_pin_to_new_node(node->input_pins[WR_ENABLE_base + i], transformed_mem, new_offset++);
+        else
+            delete_npin(node->input_pins[WR_ENABLE_base + i]);
+    }
+
+    /* RD_DATA */
+    oassert(RD_DATA_width == num_rd_ports * data_width);
+    add_output_port_information(transformed_mem, RD_DATA_width);
+    allocate_more_output_pins(transformed_mem, RD_DATA_width);
+    for (int i = 0; i < RD_DATA_width; i++)
+        remap_pin_to_new_node(node->output_pins[i], transformed_mem, i);
+    // CLEAN UP
+    free_nnode(node);
+
+    return (transformed_mem);
+}
+
+/**
+ * (function: perform_optimization)
+ *
+ * @brief this function is to perform optimization on block
+ * memories and roms. Optimization includes address width
+ * reduction based on the mem size
+ *
+ * @param memory pointer to the block memory
+ */
+static void perform_optimization(block_memory_t *memory)
+{
+    nnode_t *mem_node = memory->node;
+    const int mem_depth = mem_node->attributes->size;
+    const int current_width = mem_node->attributes->ABITS;
+    const int num_rd_ports = mem_node->attributes->RD_PORTS;
+    const int num_wr_ports = mem_node->attributes->WR_PORTS;
+
+    /**
+     * calculate the needed address width: keep widening until the value
+     * returned by the shift helper for 0x1 and that width reaches the depth
+     */
+    int required_width = 0;
+    for (long reach = 0; reach < mem_depth;) {
+        required_width++;
+        reach = shift_left_value_with_overflow_check(0X1, required_width, mem_node->loc);
+    }
+
+    /**
+     * [NOTE]: At this point there is a need to take care of multiple read
+     * or write address pins (like dual port ram) since in the case of having
+     * arithmetic operation in address pins (cause to extend addr to 32 bits),
+     * yosys would NOT handle it before creating the mem subcircuit.
+     * e.g.:
+     *
+     *      wire [3:0] read_addr;
+     *      read_data <= mem[read_addr / 3]
+     *
+     * the signal representing the read addr signal in $mem subcircuit is 32 bits.
+     * However, we only need four bits
+     */
+
+    /* prune the read address, if there is one, down to the needed addr width */
+    if (memory->read_addr) {
+        memory->read_addr = prune_signal(memory->read_addr, current_width, required_width, num_rd_ports);
+    }
+
+    /* prune the write address, if there is one, down to the needed addr width */
+    if (memory->write_addr) {
+        memory->write_addr = prune_signal(memory->write_addr, current_width, required_width, num_wr_ports);
+    }
+
+    /* the needed width becomes the address width of the memory node */
+    mem_node->attributes->ABITS = required_width;
+}
+
+/**
+ * (function: split_cascade_port)
+ *
+ * @brief split the given signal list into chunks of desired_width pins
+ * and cascade them through two-input multiplexers, one per chunk. The
+ * first input of mux i is the output of mux i-1 (PAD pins for the first
+ * mux), its second input is chunk i and its selector is selectors->pins[i],
+ * i.e. the order of selectors is assumed to match the order of signals:
+ *
+ *              chunk[0]      chunk[1]            chunk[n-1]
+ *                 |             |                    |
+ *      PAD --> [mux 0] ----> [mux 1] --> ... --> [mux n-1] --> return value
+ *                 ^             ^                    ^
+ *            selector[0]   selector[1]         selector[n-1]
+ *
+ * [NOTE] desired_width is used as a divisor and the result is taken from
+ * the last chunk, so desired_width must not be zero and signalvar must
+ * not be empty
+ *
+ * @param signalvar list of signals (like write data)
+ * @param selectors list of selectors (like write enables)
+ * @param desired_width final output width
+ * @param node pointer to the corresponding node
+ * @param netlist pointer to the current netlist
+ *
+ * @return signalvar itself when it is already desired_width wide, otherwise
+ * a new signal list of fanout pins added to the output nets of the last mux
+ */
+static signal_list_t *split_cascade_port(signal_list_t *signalvar, signal_list_t *selectors, int desired_width, nnode_t *node, netlist_t *netlist)
+{
+    /* a single chunk does not need any cascading */
+    if (signalvar->count == desired_width) {
+        return (signalvar);
+    }
+
+    /* the signal list must split evenly, with exactly one selector per chunk */
+    oassert(signalvar->count % desired_width == 0);
+    const int chunk_count = signalvar->count / desired_width;
+    oassert(selectors->count == chunk_count);
+
+    /* the signal list split into desired_width wide signal lists */
+    signal_list_t **chunks = split_signal_list(signalvar, desired_width);
+
+    /**
+     * create cascaded multiplexers: cascade[i] is mux i and stage_outputs[i]
+     * records the fanout pins added to its output nets
+     */
+    nnode_t **cascade = (nnode_t **)vtr::calloc(chunk_count, sizeof(nnode_t *));
+    signal_list_t **stage_outputs = (signal_list_t **)vtr::calloc(chunk_count, sizeof(signal_list_t *));
+    for (int stage = 0; stage < chunk_count; ++stage) {
+        signal_list_t **mux_inputs = (signal_list_t **)vtr::calloc(2, sizeof(signal_list_t *));
+
+        /* first input: PAD pins for the first mux, the previous mux outputs for the rest */
+        mux_inputs[0] = init_signal_list();
+        for (int bit = 0; bit < desired_width; bit++) {
+            npin_t *carried = (stage == 0) ? get_pad_pin(netlist) : stage_outputs[stage - 1]->pins[bit];
+            add_pin_to_signal_list(mux_inputs[0], carried);
+        }
+
+        /* second input: the current chunk */
+        mux_inputs[1] = init_signal_list();
+        for (int bit = 0; bit < desired_width; bit++) {
+            add_pin_to_signal_list(mux_inputs[1], chunks[stage]->pins[bit]);
+        }
+
+        /* the selector of this stage, wrapped in its own signal list */
+        signal_list_t *stage_selector = init_signal_list();
+        add_pin_to_signal_list(stage_selector, selectors->pins[stage]);
+
+        /* a regular multiplexer instantiation */
+        cascade[stage] = make_multiport_smux(mux_inputs, stage_selector, 2, NULL, node, netlist);
+
+        /* add a new fanout pin to every mux output net and keep a record of it */
+        stage_outputs[stage] = init_signal_list();
+        for (int bit = 0; bit < desired_width; bit++) {
+            npin_t *fanout_pin = allocate_npin();
+            add_fanout_pin_to_net(cascade[stage]->output_pins[bit]->net, fanout_pin);
+            add_pin_to_signal_list(stage_outputs[stage], fanout_pin);
+        }
+
+        // CLEAN UP
+        free_signal_list(stage_selector);
+        free_signal_list(mux_inputs[0]);
+        free_signal_list(mux_inputs[1]);
+        vtr::free(mux_inputs);
+    }
+
+    /* the fanout pins of the last mux are the cascaded result */
+    signal_list_t *cascaded = stage_outputs[chunk_count - 1];
+
+    // CLEAN UP
+    for (int stage = 0; stage < chunk_count; ++stage) {
+        free_signal_list(chunks[stage]);
+    }
+    vtr::free(chunks);
+
+    /* free the recorded outputs except the last one since it is the return value */
+    for (int stage = 0; stage < chunk_count - 1; ++stage) {
+        free_signal_list(stage_outputs[stage]);
+    }
+    vtr::free(stage_outputs);
+    vtr::free(cascade);
+
+    return (cascaded);
+}
+
+/**
+ * (function: decode_out_port)
+ *
+ * @brief decode the memory outputs to the n output ports: outs is split
+ * into chunks as wide as src and one two-input multiplexer is created
+ * per chunk. Its first input is driven by PAD pins, its second input by
+ * new fanout pins of the src nets, its selector is selectors->pins[i]
+ * and chunk i of outs is passed to it as the output signal list
+ *
+ * @param src the mux input that will pass if en is 1
+ * @param outs list of output signals whose width is a multiple of the src width
+ * @param selectors list of selectors, one per chunk of outs
+ * @param node pointer to the corresponding node
+ * @param netlist pointer to the current netlist
+ */
+static void decode_out_port(signal_list_t *src, signal_list_t *outs, signal_list_t *selectors, nnode_t *node, netlist_t *netlist)
+{
+    const int width = src->count;
+    /* validate signals list size */
+    oassert(width != 0);
+    oassert(outs->count % width == 0);
+
+    const int port_count = outs->count / width;
+
+    /* outs split into width wide signal lists, one per multiplexer */
+    signal_list_t **out_chunks = split_signal_list(outs, width);
+
+    /* validate selector size */
+    oassert(selectors->count == port_count);
+
+    /**
+     * src pins are output pins of a memory, so every multiplexer gets its
+     * own fanout pin on each src net: src_fanouts[bit]->pins[port]
+     */
+    signal_list_t **src_fanouts = (signal_list_t **)vtr::calloc(width, sizeof(signal_list_t *));
+    for (int bit = 0; bit < width; ++bit) {
+        npin_t *src_pin = src->pins[bit];
+        /* validate that it is output */
+        oassert(src_pin->type == OUTPUT);
+
+        src_fanouts[bit] = init_signal_list();
+        for (int port = 0; port < port_count; port++) {
+            npin_t *fanout_pin = allocate_npin();
+            add_fanout_pin_to_net(src_pin->net, fanout_pin);
+            add_pin_to_signal_list(src_fanouts[bit], fanout_pin);
+        }
+    }
+
+    /* create multiplexers; muxes[] only keeps track of them until the end of this function */
+    nnode_t **muxes = (nnode_t **)vtr::calloc(port_count, sizeof(nnode_t *));
+    for (int port = 0; port < port_count; ++port) {
+        signal_list_t **mux_inputs = (signal_list_t **)vtr::calloc(2, sizeof(signal_list_t *));
+
+        /* the first input of every mux is driven by PAD node */
+        mux_inputs[0] = init_signal_list();
+        for (int bit = 0; bit < width; bit++) {
+            add_pin_to_signal_list(mux_inputs[0], get_pad_pin(netlist));
+        }
+
+        /* the second input is the fanout pin of every src net reserved for this mux */
+        mux_inputs[1] = init_signal_list();
+        for (int bit = 0; bit < width; bit++) {
+            add_pin_to_signal_list(mux_inputs[1], src_fanouts[bit]->pins[port]);
+        }
+
+        /* the selector of this mux, wrapped in its own signal list */
+        signal_list_t *port_selector = init_signal_list();
+        add_pin_to_signal_list(port_selector, selectors->pins[port]);
+
+        /* a regular multiplexer instantiation with chunk `port` of outs as its outputs */
+        muxes[port] = make_multiport_smux(mux_inputs, port_selector, 2, out_chunks[port], node, netlist);
+
+        // CLEAN UP
+        free_signal_list(port_selector);
+        free_signal_list(mux_inputs[0]);
+        free_signal_list(mux_inputs[1]);
+        vtr::free(mux_inputs);
+    }
+
+    // CLEAN UP
+    for (int port = 0; port < port_count; ++port) {
+        free_signal_list(out_chunks[port]);
+    }
+    vtr::free(out_chunks);
+
+    for (int bit = 0; bit < width; ++bit) {
+        free_signal_list(src_fanouts[bit]);
+    }
+    vtr::free(src_fanouts);
+    vtr::free(muxes);
+}
+
+/**
+ * (function: cleanup_block_memory_old_node)
+ *
+ * @brief Resets every input and output pin slot of the old node to
+ * NULL, without freeing the pins themselves, and then frees the node.
+ *
+ * [NOTE] presumably the pins were handed over to another node before
+ * this call, so they have to outlive the old node (TODO confirm).
+ *
+ * @param old_node pointer to the old node
+ */
+static void cleanup_block_memory_old_node(nnode_t *old_node)
+{
+    /* forget the input pins; a slot that is already NULL just stays NULL */
+    for (int idx = 0; idx < old_node->num_input_pins; ++idx) {
+        old_node->input_pins[idx] = NULL;
+    }
+
+    /* forget the output pins in the same way */
+    for (int idx = 0; idx < old_node->num_output_pins; ++idx) {
+        old_node->output_pins[idx] = NULL;
+    }
+
+    /* clean up */
+    free_nnode(old_node);
+}
+
+/**
+ * (function: free_block_memories)
+ *
+ * @brief Frees the indexed memories and empties their lookup tables and lists.
+ */
+void free_block_memories()
+{
+    if (block_memories_info.block_memory_list) {
+        free_block_memory_index(block_memories_info.block_memories);
+        block_memories_info.block_memories.clear(); /* the call above works on a copy: drop the dangling entries here */
+        while (block_memories_info.block_memory_list != NULL)
+            block_memories_info.block_memory_list = delete_in_vptr_list(block_memories_info.block_memory_list);
+    }
+    if (block_memories_info.read_only_memory_list) {
+        free_block_memory_index(block_memories_info.read_only_memories);
+        block_memories_info.read_only_memories.clear(); /* same as above, for the read only memories */
+        while (block_memories_info.read_only_memory_list != NULL)
+            block_memories_info.read_only_memory_list = delete_in_vptr_list(block_memories_info.read_only_memory_list);
+    }
+}
+
+/**
+ * (function: free_block_memory_index)
+ *
+ * @brief Frees every block memory structure stored in the given
+ * hashtable.
+ *
+ * [NOTE] the hashtable is passed by value, so only the pointed-to
+ * structures are released; the table owned by the caller keeps its
+ * (now dangling) entries until the caller clears it.
+ *
+ * @param to_free to be freed block memory hashtable
+ */
+void free_block_memory_index(block_memory_hashtable to_free)
+{
+    for (const auto &entry : to_free) {
+        free_block_memory(entry.second);
+    }
+}
+
+/**
+ * (function: free_block_memory)
+ *
+ * @brief Frees a block memory structure: its signal lists, its
+ * name and memory id strings and finally the structure itself.
+ * The netlist node it points to is left untouched.
+ *
+ * @param to_free to be freed block memory structure
+ */
+static void free_block_memory(block_memory_t *to_free)
+{
+    /* every signal list of the block memory structure, in declaration order */
+    signal_list_t *lists[] = {to_free->read_addr,  to_free->read_data, to_free->read_en, to_free->write_addr,
+                              to_free->write_data, to_free->write_en,  to_free->clk};
+    for (signal_list_t *list : lists) {
+        free_signal_list(list);
+    }
+
+    vtr::free(to_free->name);
+    vtr::free(to_free->memory_id);
+
+    /* the structure itself goes last */
+    vtr::free(to_free);
+}
diff --git a/parmys-plugin/core/block_memory.h b/parmys-plugin/core/block_memory.h
new file mode 100644
index 000000000..0b2330d09
--- /dev/null
+++ b/parmys-plugin/core/block_memory.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _BLOCK_MEMORY_H_
+#define _BLOCK_MEMORY_H_
+
+#include <unordered_map>
+
+// Bit-count bounds for LUTRAM inference (the lower bound is the max number of bits for register array inference)
+const int LUTRAM_INFERENCE_THRESHOLD_MIN = 80;  // Min number of bits for LUTRAM inference
+const int LUTRAM_INFERENCE_THRESHOLD_MAX = 640; // Max number of bits for LUTRAM inference
+
+/*
+ * Contains a pointer to the block memory node as well as other
+ * information which is used in creating the block memory.
+ */
+struct block_memory_t {
+    loc_t loc;
+    nnode_t *node; // the block memory node; it is not freed together with this structure
+
+    signal_list_t *read_addr; // read address signals
+    signal_list_t *read_data; // read data signals
+    signal_list_t *read_en;   // read enable signals
+
+    signal_list_t *write_addr; // write address signals
+    signal_list_t *write_data; // write data signals
+    signal_list_t *write_en;   // write enable signals
+
+    signal_list_t *clk; // clock signal(s)
+
+    char *name;      // released with vtr::free() when the structure is freed
+    char *memory_id; // released with vtr::free() when the structure is freed
+};
+
+typedef std::unordered_map<std::string, block_memory_t *> block_memory_hashtable;
+
+/**
+ * block memories information. The variable becomes invalid once
+ * the iterations that happen before partial mapping are done
+ */
+struct block_memory_information_t {
+    /**
+     * block_memory_list and read_only_memory_list linked lists
+     * include the corresponding memory instances. Each instance
+     * comprises memory signal lists, location, memory id and the
+     * corresponding netlist node. These linked lists are used in
+     * optimization iteration, including signal pruning, REG/LUTRAM
+     * threshold checking and mapping to VTR memory blocks. Once the
+     * optimization iteration is done, these linked lists are not
+     * valid anymore.
+     *
+     * [NOTE] Block memories and read-only memory both use the same
+     * structure (block_memory_t*). They only differ in terms of
+     * their member variables initialization. The naming convention
+     * is only due to the ease of the coding process.
+     */
+    vtr::t_linked_vptr *block_memory_list;
+    vtr::t_linked_vptr *read_only_memory_list;
+    /* hashtables to look up block memories and read-only memories faster */
+    block_memory_hashtable block_memories;
+    block_memory_hashtable read_only_memories;
+};
+extern block_memory_information_t block_memories_info;
+
+extern void init_block_memory_index();
+extern void free_block_memories();
+
+extern void resolve_ymem_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+extern void resolve_ymem2_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+extern void resolve_bram_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+extern void resolve_rom_node(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+extern void iterate_block_memories(netlist_t *netlist);
+
+#endif // _BLOCK_MEMORY_H_
diff --git a/parmys-plugin/core/hard_block.cc b/parmys-plugin/core/hard_block.cc
new file mode 100644
index 000000000..a7437518f
--- /dev/null
+++ b/parmys-plugin/core/hard_block.cc
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stdlib.h>
+
+#include "hard_block.h"
+#include "memory.h"
+#include "netlist_utils.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+
+#include "kernel/yosys.h"
+
+#include "parmys_utils.h"
+
+STRING_CACHE *hard_block_names = NULL;
+
+void cache_hard_block_names();
+void register_hb_port_size(t_model_ports *hb_ports, int size);
+
+/* Records `size` as the size of the given hard block port; a NULL port is silently ignored. */
+void register_hb_port_size(t_model_ports *hb_ports, int size)
+{
+    /* TODO: report an error for a missing port instead of ignoring it */
+    if (hb_ports == NULL)
+        return;
+
+    hb_ports->size = size;
+}
+
+/* Returns the first port in the list whose name equals `name`, or NULL if there is none. */
+t_model_ports *get_model_port(t_model_ports *ports, const char *name)
+{
+    while (ports != NULL && strcmp(ports->name, name) != 0)
+        ports = ports->next;
+    return ports;
+}
+
+/* Creates the hard_block_names string cache and adds the name of every
+ * architecture model to it, with the model itself as the entry data. */
+void cache_hard_block_names()
+{
+    hard_block_names = sc_new_string_cache();
+
+    for (t_model *model = Arch.models; model != NULL; model = model->next) {
+        /* the returned spot indexes the data slot that belongs to the added name */
+        const int sc_spot = sc_add_string(hard_block_names, model->name);
+        hard_block_names->data[sc_spot] = (void *)model;
+    }
+}
+
+/**
+ * Caches the hard block names, looks up the single and dual port RAM models and registers the sizes
+ * of their ports: 1 for the data/out ports if memories are split by width, the split depth for addr.
+ */
+void register_hard_blocks()
+{
+    cache_hard_block_names();
+    single_port_rams = find_hard_block(SINGLE_PORT_RAM_string);
+    dual_port_rams = find_hard_block(DUAL_PORT_RAM_string);
+
+    if (single_port_rams) {
+        if (configuration.split_memory_width) {
+            register_hb_port_size(get_model_port(single_port_rams->inputs, "data"), 1);
+            register_hb_port_size(get_model_port(single_port_rams->outputs, "out"), 1);
+        }
+        register_hb_port_size(get_model_port(single_port_rams->inputs, "addr"), get_sp_ram_split_depth());
+    }
+
+    if (dual_port_rams) {
+        t_model_ports *dp_inputs = dual_port_rams->inputs;
+        if (configuration.split_memory_width) {
+            register_hb_port_size(get_model_port(dp_inputs, "data1"), 1);
+            register_hb_port_size(get_model_port(dp_inputs, "data2"), 1);
+            register_hb_port_size(get_model_port(dual_port_rams->outputs, "out1"), 1);
+            register_hb_port_size(get_model_port(dual_port_rams->outputs, "out2"), 1);
+        }
+        const int dp_split_depth = get_dp_ram_split_depth();
+        register_hb_port_size(get_model_port(dp_inputs, "addr1"), dp_split_depth);
+        register_hb_port_size(get_model_port(dp_inputs, "addr2"), dp_split_depth);
+    }
+}
+
+/**
+ * Returns the architecture model named `name`, or NULL when no model has that name.
+ */
+t_model *find_hard_block(const char *name)
+{
+    for (t_model *model = Arch.models; model != NULL; model = model->next) {
+        if (strcmp(model->name, name) == 0) {
+            return model;
+        }
+    }
+    /* reached the end of the model list without a match */
+    return NULL;
+}
+
+/**
+ * Adds a cell for the given hard block node to the module. The cell type is
+ * the identifier of the node's related AST node, and every input and output
+ * pin of the node is connected to a wire under its port mapping name.
+ *
+ * @param node hard block node; must have at least one input and one output port
+ * @param module module which receives the new cell
+ * @param netlist current netlist, consulted for its zero, one and pad nets
+ * @param design design handed to handle_cell_wideports_cache()
+ */
+void cell_hard_block(nnode_t *node, Yosys::Module *module, netlist_t *netlist, Yosys::Design *design)
+{
+    /* Assert that every hard block has at least an input and output */
+    oassert(node->input_port_sizes[0] > 0);
+    oassert(node->output_port_sizes[0] > 0);
+
+    Yosys::IdString celltype = Yosys::RTLIL::escape_id(node->related_ast_node->identifier_node->types.identifier);
+    Yosys::RTLIL::Cell *cell = module->addCell(NEW_ID, celltype);
+
+    Yosys::hashlib::dict<Yosys::RTLIL::IdString, Yosys::hashlib::dict<int, Yosys::SigBit>> cell_wideports_cache;
+
+    /* hook a wire to a cell port: directly when wideports_split() yields no wide port name, through the cache otherwise */
+    auto connect_port = [&](std::string port_name, std::string wire_name) {
+        std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(port_name);
+        if (wp.first.empty())
+            cell->setPort(Yosys::RTLIL::escape_id(port_name), to_wire(wire_name, module));
+        else
+            cell_wideports_cache[wp.first][wp.second] = to_wire(wire_name, module);
+    };
+
+    /* connect the input pins; `port` is the current port and `index` the pin position within it */
+    int port = 0, index = 0;
+    for (int i = 0; i < node->num_input_pins; i++) {
+        npin_t *pin = node->input_pins[i];
+        /* Check that the input pin is driven */
+        const bool on_constant_net = pin->net == netlist->zero_net || pin->net == netlist->one_net || pin->net == netlist->pad_net;
+        if (pin->net->num_driver_pins == 0 && !on_constant_net) {
+            warning_message(NETLIST, node->loc, "Signal %s is not driven. padding with ground\n", pin->name);
+            add_fanout_pin_to_net(netlist->zero_net, pin);
+        } else if (pin->net->num_driver_pins > 1) {
+            error_message(NETLIST, node->loc, "Multiple (%d) driver pins not supported in hard block definition\n",
+                          pin->net->num_driver_pins);
+        }
+
+        /* single bit ports use the bare mapping name, wider ports append the bit index */
+        std::string port_name;
+        if (node->input_port_sizes[port] == 1)
+            port_name = pin->mapping;
+        else
+            port_name = Yosys::stringf("%s[%d]", pin->mapping, index);
+
+        /* the wire is named after the driver pin, or after the driver node when that pin has no name */
+        npin_t *driver = pin->net->driver_pins[0];
+        std::string wire_name = (driver->name != NULL) ? driver->name : driver->node->name;
+        connect_port(port_name, wire_name);
+
+        /* move on to the next port once all of its pins are visited */
+        if (++index == node->input_port_sizes[port]) {
+            index = 0;
+            port++;
+        }
+    }
+
+    /* connect the output pins */
+    port = index = 0;
+    for (int i = 0; i < node->num_output_pins; i++) {
+        npin_t *pin = node->output_pins[i];
+        /* same port naming rule as the inputs; the wire is named after the output pin itself */
+        std::string port_name;
+        if (node->output_port_sizes[port] == 1)
+            port_name = pin->mapping;
+        else
+            port_name = Yosys::stringf("%s[%d]", pin->mapping, index);
+        connect_port(port_name, pin->name);
+
+        if (++index == node->output_port_sizes[port]) {
+            index = 0;
+            port++;
+        }
+    }
+
+    handle_cell_wideports_cache(&cell_wideports_cache, design, module, cell);
+
+    for (auto &param : node->cell_parameters) {
+        cell->parameters[Yosys::RTLIL::IdString(param.first)] = Yosys::Const(param.second);
+    }
+}
+
+/**
+ * Adds a blackbox module to the design for every utilized hard block of the
+ * architecture, except adders and multipliers. The module gets one wire per
+ * bit of every model port, flagged as a module input or output; the wide
+ * ports found by wideports_split() are recorded in a cache which is then
+ * passed to handle_wideports_cache().
+ *
+ * @param design design which receives the blackbox modules
+ */
+void output_hard_blocks_yosys(Yosys::Design *design)
+{
+    for (t_model *hard_block = Arch.models; hard_block != NULL; hard_block = hard_block->next) {
+        /* only utilized hard blocks get a module */
+        if (hard_block->used != 1) {
+            continue;
+        }
+
+        // IF the hard_blocks is an adder or a multiplier, we ignore it.(Already print out in add_the_blackbox_for_adds and
+        // add_the_blackbox_for_mults). Only this model is skipped; the rest of the model list must still be visited.
+        if (strcmp(hard_block->name, "adder") == 0 || strcmp(hard_block->name, "multiply") == 0) {
+            continue;
+        }
+
+        /* refuse duplicates before anything is allocated for the module */
+        Yosys::RTLIL::IdString module_name = Yosys::RTLIL::escape_id(hard_block->name);
+        if (design->module(module_name))
+            Yosys::log_error("Duplicate definition of module %s!\n", Yosys::log_id(module_name));
+
+        Yosys::RTLIL::Module *module = new Yosys::RTLIL::Module;
+        module->name = module_name;
+        design->add(module);
+
+        /* wide port name ==> (number of bits, is input) */
+        Yosys::hashlib::dict<Yosys::RTLIL::IdString, std::pair<int, bool>> wideports_cache;
+
+        /* input ports */
+        for (t_model_ports *hb_port = hard_block->inputs; hb_port != NULL; hb_port = hb_port->next) {
+            for (int i = 0; i < hb_port->size; i++) {
+                std::string w_name;
+                if (hb_port->size == 1)
+                    w_name = hb_port->name;
+                else
+                    w_name = Yosys::stringf("%s[%d]", hb_port->name, i);
+
+                Yosys::RTLIL::Wire *wire = to_wire(w_name, module);
+                wire->port_input = true;
+
+                std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(w_name);
+                if (!wp.first.empty() && wp.second >= 0) {
+                    wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                    wideports_cache[wp.first].second = true;
+                }
+            }
+        }
+
+        /* output ports */
+        for (t_model_ports *hb_port = hard_block->outputs; hb_port != NULL; hb_port = hb_port->next) {
+            for (int i = 0; i < hb_port->size; i++) {
+                std::string w_name;
+                if (hb_port->size == 1)
+                    w_name = hb_port->name;
+                else
+                    w_name = Yosys::stringf("%s[%d]", hb_port->name, i);
+
+                Yosys::RTLIL::Wire *wire = to_wire(w_name, module);
+                wire->port_output = true;
+
+                std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(w_name);
+                if (!wp.first.empty() && wp.second >= 0) {
+                    wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                    wideports_cache[wp.first].second = false;
+                }
+            }
+        }
+
+        handle_wideports_cache(&wideports_cache, module);
+
+        /* let the module rebuild its port list from the flagged wires */
+        module->fixup_ports();
+        wideports_cache.clear();
+
+        /* the module has no body: mark it as a blackbox */
+        module->attributes[Yosys::ID::blackbox] = Yosys::RTLIL::Const(1);
+    }
+}
+
+/**
+ * Gives a name to every unnamed output pin of the hard block node and
+ * marks the node as visited.
+ *
+ * @param node hard block node
+ * @param mark traversal mark stored in node->traverse_visited
+ * (the third parameter, the netlist, is unused)
+ */
+void instantiate_hard_block(nnode_t *node, short mark, netlist_t * /*netlist*/)
+{
+    /* Give names to the output pins */
+    for (int i = 0; i < node->num_output_pins; i++) {
+        npin_t *pin = node->output_pins[i];
+        /* built by make_full_ref_name() from the node name, the pin mapping and the pin index i */
+        if (pin->name == NULL)
+            pin->name = make_full_ref_name(node->name, NULL, NULL, pin->mapping, i);
+        // pin->name = make_full_ref_name(node->name, NULL, NULL, pin->mapping,
+        // (configuration.elaborator_type == elaborator_e::_YOSYS) ? i : -1); //@TODO
+    }
+
+    node->traverse_visited = mark;
+}
diff --git a/parmys-plugin/core/hard_block.h b/parmys-plugin/core/hard_block.h
new file mode 100644
index 000000000..6976ab1a5
--- /dev/null
+++ b/parmys-plugin/core/hard_block.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _HARD_BLOCK_H_
+#define _HARD_BLOCK_H_
+
+#include "odin_types.h"
+
+extern STRING_CACHE *hard_block_names;
+
+void register_hard_blocks();
+t_model *find_hard_block(const char *name);
+void cell_hard_block(nnode_t *node, Yosys::Module *module, netlist_t *netlist, Yosys::Design *design);
+void output_hard_blocks_yosys(Yosys::Design *design);
+void instantiate_hard_block(nnode_t *node, short mark, netlist_t *netlist);
+t_model_ports *get_model_port(t_model_ports *ports, const char *name);
+
+#endif // _HARD_BLOCK_H_
diff --git a/parmys-plugin/core/memory.cc b/parmys-plugin/core/memory.cc
new file mode 100644
index 000000000..d9a915ce9
--- /dev/null
+++ b/parmys-plugin/core/memory.cc
@@ -0,0 +1,2243 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "ast_util.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+#include <math.h>
+#include <string.h>
+
+#include "hard_block.h"
+#include "memory.h"
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "partial_map.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+using vtr::t_linked_vptr;
+
+USING_YOSYS_NAMESPACE
+
+t_model *single_port_rams = NULL;
+t_model *dual_port_rams = NULL;
+
+t_linked_vptr *sp_memory_list;
+t_linked_vptr *dp_memory_list;
+
+void copy_input_port_to_memory(nnode_t *node, signal_list_t *signals, const char *port_name);
+void pad_dp_memory_width(nnode_t *node, netlist_t *netlist);
+void pad_sp_memory_width(nnode_t *node, netlist_t *netlist);
+void pad_memory_output_port(nnode_t *node, netlist_t *netlist, t_model *model, const char *port_name);
+void pad_memory_input_port(nnode_t *node, netlist_t *netlist, t_model *model, const char *port_name);
+
+int get_sp_ram_split_width();
+int get_dp_ram_split_width();
+void filter_memories_by_soft_logic_cutoff();
+
+/**
+ * (function: init_sp_ram_signals)
+ *
+ * @brief allocate a zero-filled sp_ram_signals and create its empty addr, data and out signal lists
+ */
+sp_ram_signals *init_sp_ram_signals()
+{
+    sp_ram_signals *signals = (sp_ram_signals *)vtr::calloc(1, sizeof(sp_ram_signals));
+    // calloc: members not assigned below (e.g. clk, we) start out zeroed instead of indeterminate
+    signals->addr = init_signal_list();
+    signals->data = init_signal_list();
+    signals->out = init_signal_list();
+
+    return (signals);
+}
+
+/**
+ * (function: init_dp_ram_signals)
+ *
+ * @brief allocate a zero-filled dp_ram_signals and create its empty addr1/2, data1/2 and out1/2 signal lists
+ */
+dp_ram_signals *init_dp_ram_signals()
+{
+    dp_ram_signals *signals = (dp_ram_signals *)vtr::calloc(1, sizeof(dp_ram_signals));
+    // calloc: members not assigned below (e.g. clk, we1, we2) start out zeroed instead of indeterminate
+    signals->addr1 = init_signal_list();
+    signals->addr2 = init_signal_list();
+    signals->data1 = init_signal_list();
+    signals->data2 = init_signal_list();
+    signals->out1 = init_signal_list();
+    signals->out2 = init_signal_list();
+
+    return (signals);
+}
+
+long get_sp_ram_depth(nnode_t *node) // number of words: 1 shifted left by the number of address pins
+{
+    sp_ram_signals *ram = get_sp_ram_signals(node); // the node's signals, released below
+    long num_words = shift_left_value_with_overflow_check(0x1, ram->addr->count, node->loc);
+    free_sp_ram_signals(ram);
+    return num_words;
+}
+
+long get_dp_ram_depth(nnode_t *node) // number of words: 1 shifted left by the number of address pins
+{
+    dp_ram_signals *signals = get_dp_ram_signals(node); // the node's signals, released below
+    oassert(signals->addr1->count == signals->addr2->count); // both address ports must have the same width
+    long depth = shift_left_value_with_overflow_check(0x1, signals->addr1->count, node->loc);
+    free_dp_ram_signals(signals);
+    return depth;
+}
+
+long get_sp_ram_width(nnode_t *node) // width in bits: the number of pins in the data signal list
+{
+    sp_ram_signals *ram = get_sp_ram_signals(node); // the node's signals, released below
+    long data_bits = ram->data->count;
+    free_sp_ram_signals(ram);
+    return data_bits;
+}
+
+long get_dp_ram_width(nnode_t *node) // width in bits: the number of pins in the data1 signal list
+{
+    dp_ram_signals *signals = get_dp_ram_signals(node); // the node's signals, released below
+    oassert(signals->data1->count == signals->data2->count); // both data ports must have the same width
+    long width = signals->data1->count;
+    free_dp_ram_signals(signals);
+    return width;
+}
+
+void copy_input_port_to_memory(nnode_t *node, signal_list_t *signalsvar, const char *port_name)
+{
+    signal_list_t *duplicates = copy_input_signals(signalsvar); // the new port is built from copies of the given signals
+    add_input_port_to_memory(node, duplicates, port_name);
+    free_signal_list(duplicates); // presumably frees only the list container, not the pins now on node - TODO confirm
+}
+
+/*
+ * Re-maps the given input signals to the given port name on the given memory node; reports an error if the name is already used.
+ */
+void remap_input_port_to_memory(nnode_t *node, signal_list_t *signals, const char *port_name)
+{
+    int j = node->num_input_pins; // index at which the first pin of the new port is placed
+
+    // Make sure the port is not already assigned (a pin without a mapping cannot clash).
+    for (int i = 0; i < j; i++) {
+        npin_t *pin = node->input_pins[i];
+        if (pin->mapping && !strcmp(pin->mapping, port_name)) {
+            error_message(NETLIST, node->loc, "Attempted to reassign input port %s to memory %s.", port_name, node->name);
+        }
+    }
+
+    // Make room for the new port.
+    allocate_more_input_pins(node, signals->count);
+    add_input_port_information(node, signals->count);
+
+    // Add the new port: move each pin over, relabelling it unless it already carries port_name.
+    for (int i = 0; i < signals->count; i++, j++) {
+        npin_t *pin = signals->pins[i];
+        if (!pin->mapping || strcmp(pin->mapping, port_name)) { // NULL is tested first: strcmp must not see it
+            if (pin->mapping)
+                vtr::free(pin->mapping);
+            pin->mapping = vtr::strdup(port_name);
+        }
+        remap_pin_to_new_node(pin, node, j);
+    }
+}
+
+/*
+ * Appends a new input port, named port_name and fed by the pins in
+ * signalsvar, to the given memory node. An error is reported when a pin
+ * of the node is already mapped to port_name.
+ */
+void add_input_port_to_memory(nnode_t *node, signal_list_t *signalsvar, const char *port_name)
+{
+    // The new port's pins go after the node's current input pins.
+    const int first_new_pin = node->num_input_pins;
+
+    // Complain about a port name that is already mapped on this node.
+    for (int k = 0; k < first_new_pin; k++) {
+        if (strcmp(node->input_pins[k]->mapping, port_name) == 0) {
+            error_message(NETLIST, node->loc, "Attempted to reassign input port %s to memory %s.", port_name, node->name);
+        }
+    }
+
+    // Make room for the new port.
+    allocate_more_input_pins(node, signalsvar->count);
+    add_input_port_information(node, signalsvar->count);
+
+    // Label each pin with the port name and attach it to the node.
+    for (int k = 0; k < signalsvar->count; k++) {
+        npin_t *new_pin = signalsvar->pins[k];
+        if (new_pin->mapping != NULL) {
+            vtr::free(new_pin->mapping);
+        }
+        new_pin->mapping = vtr::strdup(port_name);
+        add_input_pin_to_node(node, new_pin, first_new_pin + k);
+    }
+}
+
+/*
+ * Appends a new output port, named port_name and made of the pins in
+ * signals, to the given memory node. If a pin of the node is already
+ * mapped to port_name, an error is reported and nothing is added.
+ */
+void add_output_port_to_memory(nnode_t *node, signal_list_t *signals, const char *port_name)
+{
+    // The new port's pins go after the node's current output pins.
+    const int first_new_pin = node->num_output_pins;
+
+    // Refuse a port name that is already mapped on this node.
+    // TODO: more complicated logic needs to be implemented; this is a temporary solution
+    for (int k = 0; k < first_new_pin; k++) {
+        if (strcmp(node->output_pins[k]->mapping, port_name) == 0) {
+            error_message(NETLIST, node->loc, "Attempted to reassign output port %s to node %s.", port_name, node->name);
+            return;
+        }
+    }
+
+    // Make room for the new port.
+    allocate_more_output_pins(node, signals->count);
+    add_output_port_information(node, signals->count);
+
+    // Label each pin with the port name and attach it to the node.
+    for (int k = 0; k < signals->count; k++) {
+        npin_t *new_pin = signals->pins[k];
+        if (new_pin->mapping != NULL) {
+            vtr::free(new_pin->mapping);
+        }
+        new_pin->mapping = vtr::strdup(port_name);
+        add_output_pin_to_node(node, new_pin, first_new_pin + k);
+    }
+}
+
+/*
+ * Checks memories to ensure that they fall within sane size boundaries: an error is reported for every memory in sp_memory_list or dp_memory_list deeper than 2^HARD_RAM_ADDR_LIMIT.
+ *
+ * Reports the memory distribution as well (width and depth per memory, then the totals); does nothing when both lists are empty.
+ */
+void check_memories_and_report_distribution()
+{
+    if ((sp_memory_list == NULL) && (dp_memory_list == NULL))
+        return;
+
+    log("\nHard Logical Memory Distribution\n");
+    log("============================\n");
+
+    long total_memory_bits = 0;
+    int total_memory_block_counter = 0;
+    long memory_max_width = 0;
+    long memory_max_depth = 0;
+
+    t_linked_vptr *temp = sp_memory_list;
+    while (temp != NULL) {
+        nnode_t *node = (nnode_t *)temp->data_vptr;
+
+        long width = get_sp_ram_width(node);
+        long depth = get_sp_ram_depth(node);
+
+        if (depth > shift_left_value_with_overflow_check(0x1, HARD_RAM_ADDR_LIMIT, node->loc))
+            error_message(NETLIST, node->loc, "Memory %s of depth %ld exceeds ODIN depth bound of 2^%d.", node->name, depth, HARD_RAM_ADDR_LIMIT);
+
+        log("SPRAM: %ld width %ld depth\n", width, depth); // width and depth are long, hence %ld
+
+        total_memory_bits += width * depth;
+
+        total_memory_block_counter++;
+
+        if (width > memory_max_width) {
+            memory_max_width = width;
+        }
+        if (depth > memory_max_depth) {
+            memory_max_depth = depth;
+        }
+
+        temp = temp->next;
+    }
+
+    temp = dp_memory_list;
+    while (temp != NULL) {
+        nnode_t *node = (nnode_t *)temp->data_vptr;
+
+        long width = get_dp_ram_width(node);
+        long depth = get_dp_ram_depth(node);
+        if (depth > shift_left_value_with_overflow_check(0x1, HARD_RAM_ADDR_LIMIT, node->loc))
+            error_message(NETLIST, node->loc, "Memory %s of depth %ld exceeds ODIN depth bound of 2^%d.", node->name, depth, HARD_RAM_ADDR_LIMIT);
+
+        log("DPRAM: %ld width %ld depth\n", width, depth); // width and depth are long, hence %ld
+        total_memory_bits += width * depth;
+
+        total_memory_block_counter++;
+        if (width > memory_max_width) {
+            memory_max_width = width;
+        }
+        if (depth > memory_max_depth) {
+            memory_max_depth = depth;
+        }
+
+        temp = temp->next;
+    }
+
+    log("\nTotal Logical Memory Blocks = %d \n", total_memory_block_counter);
+    log("Total Logical Memory bits = %ld \n", total_memory_bits);
+    log("Max Memory Width = %ld \n", memory_max_width);
+    log("Max Memory Depth = %ld \n", memory_max_depth);
+    log("\n");
+
+    return;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_sp_memory_depth)
+ *
+ * Recursively halves the depth of a single port memory: address bit 0
+ *   selects between two new memories ("__S" and "__H") that are addressed
+ *   by the remaining bits; a mux per output bit picks the matching output.
+ *   split_size: the maximum number of address bits in the resulting memories.
+ *------------------------------------------------------------------------
+ */
+void split_sp_memory_depth(nnode_t *node, int split_size)
+{
+    sp_ram_signals *signals = get_sp_ram_signals(node);
+
+    int logical_size = signals->addr->count; // current number of address bits
+
+    /* Check that the memory needs to be split; if not, register it as is */
+    if (logical_size <= split_size) {
+        free_sp_ram_signals(signals);
+        sp_memory_list = insert_in_vptr_list(sp_memory_list, node);
+        return;
+    }
+
+    signal_list_t *new_addr = init_signal_list(); // every address pin except pin 0
+    for (int i = 1; i < signals->addr->count; i++)
+        add_pin_to_signal_list(new_addr, signals->addr->pins[i]);
+
+    /* Create the two new memory nodes */
+    nnode_t *new_mem_node1 = allocate_nnode(node->loc);
+    nnode_t *new_mem_node2 = allocate_nnode(node->loc);
+
+    // Append the new name with an __S or __H
+    new_mem_node1->name = append_string(node->name, "__S");
+    new_mem_node2->name = append_string(node->name, "__H");
+
+    /* Copy properties from the original memory node */
+    new_mem_node1->type = node->type;
+    new_mem_node1->related_ast_node = node->related_ast_node;
+    new_mem_node1->traverse_visited = node->traverse_visited;
+    new_mem_node2->type = node->type;
+    new_mem_node2->related_ast_node = node->related_ast_node;
+    new_mem_node2->traverse_visited = node->traverse_visited;
+
+    // Move over the original pins to the first memory node.
+    signal_list_t *clk = init_signal_list();
+    add_pin_to_signal_list(clk, signals->clk);
+    remap_input_port_to_memory(new_mem_node1, new_addr, "addr");
+    remap_input_port_to_memory(new_mem_node1, signals->data, "data");
+
+    // Copy the inputs to the second memory node.
+    copy_input_port_to_memory(new_mem_node2, new_addr, "addr");
+    copy_input_port_to_memory(new_mem_node2, signals->data, "data");
+
+    // Hook up addresses and write enables.
+    {
+        signal_list_t *we;
+        // First memory: we = AND(addr[0], original we), using the original pins.
+        nnode_t *and_g = make_2port_gate(LOGICAL_AND, 1, 1, 1, new_mem_node1, new_mem_node1->traverse_visited);
+        remap_pin_to_new_node(signals->we, and_g, 1);
+        remap_pin_to_new_node(signals->addr->pins[0], and_g, 0);
+
+        we = make_output_pins_for_existing_node(and_g, 1);
+        add_input_port_to_memory(new_mem_node1, we, "we");
+        free_signal_list(we);
+
+        // Second memory: we = AND(NOT addr[0], original we), using copies of the pins.
+        nnode_t *not_g = make_not_gate_with_input(copy_input_npin(signals->addr->pins[0]), new_mem_node2, new_mem_node2->traverse_visited);
+        and_g = make_2port_gate(LOGICAL_AND, 1, 1, 1, new_mem_node2, new_mem_node2->traverse_visited);
+        connect_nodes(not_g, 0, and_g, 0);
+
+        add_input_pin_to_node(and_g, copy_input_npin(signals->we), 1);
+
+        we = make_output_pins_for_existing_node(and_g, 1);
+        add_input_port_to_memory(new_mem_node2, we, "we");
+        free_signal_list(we);
+    }
+
+    // Add the clock signals.
+    remap_input_port_to_memory(new_mem_node1, clk, "clk");
+    copy_input_port_to_memory(new_mem_node2, clk, "clk");
+    free_signal_list(clk);
+
+    // Setup output ports on both nodes.
+    allocate_more_output_pins(new_mem_node1, signals->out->count);
+    add_output_port_information(new_mem_node1, signals->out->count);
+
+    allocate_more_output_pins(new_mem_node2, signals->out->count);
+    add_output_port_information(new_mem_node2, signals->out->count);
+
+    /* Move each original output pin onto a mux fed by output i of both new memories */
+    for (int i = 0; i < signals->data->count; i++) { // NOTE(review): bounded by data count while indexing out pins - assumes both counts match
+        nnode_t *mux = make_2port_gate(MUX_2, 2, 2, 1, new_mem_node1, new_mem_node1->traverse_visited);
+        nnode_t *not_g = make_not_gate(new_mem_node1, new_mem_node1->traverse_visited);
+        add_input_pin_to_node(mux, copy_input_npin(signals->addr->pins[0]), 0); // mux input 0: addr[0]
+        add_input_pin_to_node(not_g, copy_input_npin(signals->addr->pins[0]), 0);
+        connect_nodes(not_g, 0, mux, 1); // mux input 1: NOT addr[0]
+
+        npin_t *pin = signals->out->pins[i];
+        if (pin->name)
+            vtr::free(pin->name);
+        pin->name = mux->name; // NOTE(review): pointer is shared with the mux, not copied - confirm nothing frees both
+
+        if (pin->mapping)
+            vtr::free(pin->mapping);
+        pin->mapping = NULL;
+
+        remap_pin_to_new_node(pin, mux, 0); // the original output pin becomes the mux output
+
+        connect_nodes(new_mem_node1, i, mux, 2); // mux input 2: output i of the first memory
+        if (new_mem_node1->output_pins[i]->mapping) {
+            vtr::free(new_mem_node1->output_pins[i]->mapping);
+        }
+        new_mem_node1->output_pins[i]->mapping = vtr::strdup("out");
+
+        connect_nodes(new_mem_node2, i, mux, 3); // mux input 3: output i of the second memory
+        if (new_mem_node2->output_pins[i]->mapping) {
+            vtr::free(new_mem_node2->output_pins[i]->mapping);
+        }
+        new_mem_node2->output_pins[i]->mapping = vtr::strdup("out");
+    }
+
+    free_sp_ram_signals(signals);
+    free_signal_list(new_addr);
+
+    free_nnode(node); // the original node is replaced by the two halves
+
+    split_sp_memory_depth(new_mem_node1, split_size); // keep halving until split_size is reached
+    split_sp_memory_depth(new_mem_node2, split_size);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_dp_memory_depth)
+ *
+ * Recursively halves the depth of a dual port memory: bit 0 of each address
+ *   port gates that port's write enable and selects, through a mux per output
+ *   bit, between two new memories ("__S" and "__H"); split_size is the maximum number of address bits.
+ *------------------------------------------------------------------------
+ */
+void split_dp_memory_depth(nnode_t *node, int split_size)
+{
+    dp_ram_signals *signals = get_dp_ram_signals(node);
+
+    int logical_size = signals->addr1->count; // current number of address bits (taken from port 1)
+
+    /* Check that the memory needs to be split; if not, register it as is */
+    if (logical_size <= split_size) {
+        free_dp_ram_signals(signals);
+        dp_memory_list = insert_in_vptr_list(dp_memory_list, node);
+        return;
+    }
+
+    signal_list_t *new_addr1 = init_signal_list(); // every addr1 pin except pin 0
+
+    for (int i = 1; i < signals->addr1->count; i++)
+        add_pin_to_signal_list(new_addr1, signals->addr1->pins[i]);
+
+    signal_list_t *new_addr2 = init_signal_list(); // every addr2 pin except pin 0
+    for (int i = 1; i < signals->addr2->count; i++)
+        add_pin_to_signal_list(new_addr2, signals->addr2->pins[i]);
+
+    /* Create the two new memory nodes */
+    nnode_t *new_mem_node1 = allocate_nnode(node->loc);
+    nnode_t *new_mem_node2 = allocate_nnode(node->loc);
+
+    // Append the new name with an __S or __H
+    new_mem_node1->name = append_string(node->name, "__S");
+    new_mem_node2->name = append_string(node->name, "__H");
+
+    /* Copy properties from the original memory node */
+    new_mem_node1->type = node->type;
+    new_mem_node1->related_ast_node = node->related_ast_node;
+    new_mem_node1->traverse_visited = node->traverse_visited;
+    new_mem_node2->type = node->type;
+    new_mem_node2->related_ast_node = node->related_ast_node;
+    new_mem_node2->traverse_visited = node->traverse_visited;
+
+    // Move over the original pins to the first memory node.
+    signal_list_t *clk = init_signal_list();
+    add_pin_to_signal_list(clk, signals->clk);
+    remap_input_port_to_memory(new_mem_node1, new_addr1, "addr1");
+    remap_input_port_to_memory(new_mem_node1, new_addr2, "addr2");
+    remap_input_port_to_memory(new_mem_node1, signals->data1, "data1");
+    remap_input_port_to_memory(new_mem_node1, signals->data2, "data2");
+
+    // Copy the inputs to the second memory node.
+    copy_input_port_to_memory(new_mem_node2, new_addr1, "addr1");
+    copy_input_port_to_memory(new_mem_node2, new_addr2, "addr2");
+    copy_input_port_to_memory(new_mem_node2, signals->data1, "data1");
+    copy_input_port_to_memory(new_mem_node2, signals->data2, "data2");
+
+    // Hook up addresses and write enables.
+    {
+        signal_list_t *we;
+        // First memory, port 1: we1 = AND(addr1[0], original we1), using the original pins.
+        nnode_t *and_node = make_2port_gate(LOGICAL_AND, 1, 1, 1, new_mem_node1, new_mem_node1->traverse_visited);
+        remap_pin_to_new_node(signals->we1, and_node, 1);
+        remap_pin_to_new_node(signals->addr1->pins[0], and_node, 0);
+
+        we = make_output_pins_for_existing_node(and_node, 1);
+        add_input_port_to_memory(new_mem_node1, we, "we1");
+        free_signal_list(we);
+
+        // First memory, port 2: we2 = AND(addr2[0], original we2), using the original pins.
+        and_node = make_2port_gate(LOGICAL_AND, 1, 1, 1, new_mem_node1, new_mem_node1->traverse_visited);
+        remap_pin_to_new_node(signals->we2, and_node, 1);
+        remap_pin_to_new_node(signals->addr2->pins[0], and_node, 0);
+
+        we = make_output_pins_for_existing_node(and_node, 1);
+        add_input_port_to_memory(new_mem_node1, we, "we2");
+        free_signal_list(we);
+
+        // Second memory, port 1: we1 = AND(NOT addr1[0], original we1), using copies of the pins.
+        nnode_t *not_g = make_not_gate_with_input(copy_input_npin(signals->addr1->pins[0]), new_mem_node2, new_mem_node2->traverse_visited);
+        and_node = make_2port_gate(LOGICAL_AND, 1, 1, 1, new_mem_node2, new_mem_node2->traverse_visited);
+        connect_nodes(not_g, 0, and_node, 0);
+        add_input_pin_to_node(and_node, copy_input_npin(signals->we1), 1);
+
+        we = make_output_pins_for_existing_node(and_node, 1);
+        add_input_port_to_memory(new_mem_node2, we, "we1");
+        free_signal_list(we);
+
+        // Second memory, port 2: we2 = AND(NOT addr2[0], original we2), using copies of the pins.
+        not_g = make_not_gate_with_input(copy_input_npin(signals->addr2->pins[0]), new_mem_node2, new_mem_node2->traverse_visited);
+        and_node = make_2port_gate(LOGICAL_AND, 1, 1, 1, new_mem_node2, new_mem_node2->traverse_visited);
+        connect_nodes(not_g, 0, and_node, 0);
+
+        add_input_pin_to_node(and_node, copy_input_npin(signals->we2), 1);
+
+        we = make_output_pins_for_existing_node(and_node, 1);
+        add_input_port_to_memory(new_mem_node2, we, "we2");
+        free_signal_list(we);
+    }
+
+    // Add the clock signals.
+    remap_input_port_to_memory(new_mem_node1, clk, "clk");
+    copy_input_port_to_memory(new_mem_node2, clk, "clk");
+    free_signal_list(clk);
+
+    // Setup output ports on both nodes: out1 pins first, out2 pins after them.
+    allocate_more_output_pins(new_mem_node1, signals->out1->count + signals->out2->count);
+    add_output_port_information(new_mem_node1, signals->out1->count);
+    add_output_port_information(new_mem_node1, signals->out2->count);
+
+    allocate_more_output_pins(new_mem_node2, signals->out1->count + signals->out2->count);
+    add_output_port_information(new_mem_node2, signals->out1->count);
+    add_output_port_information(new_mem_node2, signals->out2->count);
+
+    /* Move each original out1 pin onto a mux fed by output i of both new memories */
+    for (int i = 0; i < signals->data1->count; i++) { // NOTE(review): bounded by data1 count while indexing out1 pins - assumes both counts match
+        nnode_t *mux = make_2port_gate(MUX_2, 2, 2, 1, new_mem_node1, new_mem_node1->traverse_visited);
+        nnode_t *not_g = make_not_gate(new_mem_node1, new_mem_node1->traverse_visited);
+        add_input_pin_to_node(mux, copy_input_npin(signals->addr1->pins[0]), 0); // mux input 0: addr1[0]
+        add_input_pin_to_node(not_g, copy_input_npin(signals->addr1->pins[0]), 0);
+        connect_nodes(not_g, 0, mux, 1); // mux input 1: NOT addr1[0]
+
+        npin_t *pin = signals->out1->pins[i];
+        if (pin->name) {
+            vtr::free(pin->name);
+        }
+        pin->name = mux->name; // NOTE(review): pointer is shared with the mux, not copied - confirm nothing frees both
+        if (pin->mapping) {
+            vtr::free(pin->mapping);
+        }
+        pin->mapping = NULL;
+
+        remap_pin_to_new_node(pin, mux, 0); // the original output pin becomes the mux output
+
+        connect_nodes(new_mem_node1, i, mux, 2); // mux input 2: out1 bit i of the first memory
+        if (new_mem_node1->output_pins[i]->mapping) {
+            vtr::free(new_mem_node1->output_pins[i]->mapping);
+        }
+        new_mem_node1->output_pins[i]->mapping = vtr::strdup("out1");
+
+        connect_nodes(new_mem_node2, i, mux, 3); // mux input 3: out1 bit i of the second memory
+        if (new_mem_node2->output_pins[i]->mapping) {
+            vtr::free(new_mem_node2->output_pins[i]->mapping);
+        }
+        new_mem_node2->output_pins[i]->mapping = vtr::strdup("out1");
+    }
+
+    /* Same for the original out2 pins, selected by addr2[0] */
+    for (int i = 0; i < signals->data1->count; i++) { // NOTE(review): bounded by data1 count while indexing out2 pins - assumes both counts match
+        nnode_t *mux = make_2port_gate(MUX_2, 2, 2, 1, new_mem_node1, new_mem_node1->traverse_visited);
+        nnode_t *not_g = make_not_gate(new_mem_node1, new_mem_node1->traverse_visited);
+        add_input_pin_to_node(mux, copy_input_npin(signals->addr2->pins[0]), 0); // mux input 0: addr2[0]
+        add_input_pin_to_node(not_g, copy_input_npin(signals->addr2->pins[0]), 0);
+        connect_nodes(not_g, 0, mux, 1); // mux input 1: NOT addr2[0]
+
+        int pin_index = new_mem_node1->output_port_sizes[0] + i; // out2 pins are stored after the out1 pins
+
+        npin_t *pin = signals->out2->pins[i];
+        if (pin->name) {
+            vtr::free(pin->name);
+        }
+        pin->name = mux->name; // NOTE(review): pointer is shared with the mux, not copied - confirm nothing frees both
+        if (pin->mapping) {
+            vtr::free(pin->mapping);
+        }
+        pin->mapping = NULL;
+
+        remap_pin_to_new_node(pin, mux, 0); // the original output pin becomes the mux output
+
+        connect_nodes(new_mem_node1, pin_index, mux, 2); // mux input 2: out2 bit i of the first memory
+        if (new_mem_node1->output_pins[pin_index]->mapping) {
+            vtr::free(new_mem_node1->output_pins[pin_index]->mapping);
+        }
+        new_mem_node1->output_pins[pin_index]->mapping = vtr::strdup("out2");
+
+        connect_nodes(new_mem_node2, pin_index, mux, 3); // mux input 3: out2 bit i of the second memory
+        if (new_mem_node2->output_pins[pin_index]->mapping) {
+            vtr::free(new_mem_node2->output_pins[pin_index]->mapping);
+        }
+        new_mem_node2->output_pins[pin_index]->mapping = vtr::strdup("out2");
+    }
+
+    free_dp_ram_signals(signals);
+    free_signal_list(new_addr1);
+    free_signal_list(new_addr2);
+    free_nnode(node); // the original node is replaced by the two halves
+
+    split_dp_memory_depth(new_mem_node1, split_size); // keep halving until split_size is reached
+    split_dp_memory_depth(new_mem_node2, split_size);
+}
+
+/*
+ * Width-splits the given single port memory into chunks of at most target_size data bits;
+ * the resulting memories (or the untouched original) are inserted into sp_memory_list.
+ */
+void split_sp_memory_width(nnode_t *node, int target_size)
+{
+    char port_name[] = "data";
+    int data_port_number = get_input_port_index_from_mapping(node, port_name);
+
+    oassert(data_port_number != -1);
+
+    int data_port_size = node->input_port_sizes[data_port_number];
+
+    int num_memories = ceil((double)data_port_size / (double)target_size); // NOTE(review): assumes target_size > 0 - confirm with caller
+
+    if (data_port_size <= target_size) {
+        // If we don't need to split, put the original node back.
+        sp_memory_list = insert_in_vptr_list(sp_memory_list, node);
+    } else {
+        int data_pins_moved = 0;   // data pins handed to new memories so far
+        int output_pins_moved = 0; // output pins handed to new memories so far
+        for (int i = 0; i < num_memories; i++) {
+            nnode_t *new_node = allocate_nnode(node->loc);
+            new_node->name = append_string(node->name, "-%d", i);
+            sp_memory_list = insert_in_vptr_list(sp_memory_list, new_node);
+
+            /* Copy properties from the original node */
+            new_node->type = node->type;
+            new_node->related_ast_node = node->related_ast_node;
+            new_node->traverse_visited = node->traverse_visited;
+            new_node->node_data = NULL;
+
+            // Same number of input ports as the original, all empty for now.
+            for (int j = 0; j < node->num_input_port_sizes; j++)
+                add_input_port_information(new_node, 0);
+
+            add_output_port_information(new_node, 0); // a single, initially empty output port
+
+            int index = 0;     // next input pin slot on new_node
+            int old_index = 0; // current input pin position on the original node
+            for (int j = 0; j < node->num_input_port_sizes; j++) {
+                // Move this node's share of data pins out of the data port of the original node.
+                if (j == data_port_number) {
+                    // Skip over data pins we've already moved.
+                    old_index += data_pins_moved;
+                    for (int k = 0; k < target_size && data_pins_moved < data_port_size; k++) {
+                        allocate_more_input_pins(new_node, 1);
+                        new_node->input_port_sizes[j]++;
+                        remap_pin_to_new_node(node->input_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                        data_pins_moved++;
+                    }
+                    int remaining_data_pins = data_port_size - data_pins_moved;
+                    // Skip over pins we have yet to copy.
+                    old_index += remaining_data_pins;
+                } else {
+                    for (int k = 0; k < node->input_port_sizes[j]; k++) {
+                        allocate_more_input_pins(new_node, 1);
+                        new_node->input_port_sizes[j]++;
+                        // Copy pins for all but the last memory. The last one gets the original pins moved to it.
+                        if (i < num_memories - 1)
+                            add_input_pin_to_node(new_node, copy_input_npin(node->input_pins[old_index]), index);
+                        else
+                            remap_pin_to_new_node(node->input_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                    }
+                }
+            }
+
+            index = 0;
+            old_index = 0;
+            old_index += output_pins_moved; // resume after the output pins already moved
+
+            for (int k = 0; k < target_size && output_pins_moved < data_port_size; k++) { // NOTE(review): bounded by the data port size - assumes as many output pins as data pins
+                allocate_more_output_pins(new_node, 1);
+                new_node->output_port_sizes[0]++;
+                remap_pin_to_new_node(node->output_pins[old_index], new_node, index);
+                index++;
+                old_index++;
+                output_pins_moved++;
+            }
+        }
+        // Free the original node.
+        free_nnode(node);
+    }
+}
+
+/*
+ * Splits the given dual port memory width into one or more memories with
+ * width less than or equal to target_size.
+ */
+void split_dp_memory_width(nnode_t *node, int target_size)
+{
+    char data1_name[] = "data1";
+    char data2_name[] = "data2";
+    char out1_name[] = "out1";
+    char out2_name[] = "out2";
+
+    int data1_port_number = get_input_port_index_from_mapping(node, data1_name);
+    int data2_port_number = get_input_port_index_from_mapping(node, data2_name);
+
+    int out1_port_number = get_output_port_index_from_mapping(node, out1_name);
+    int out2_port_number = get_output_port_index_from_mapping(node, out2_name);
+
+    oassert(data1_port_number != -1);
+    oassert(data2_port_number != -1);
+    oassert(out1_port_number != -1);
+    oassert(out2_port_number != -1);
+
+    int data1_port_size = node->input_port_sizes[data1_port_number];
+    int data2_port_size = node->input_port_sizes[data2_port_number];
+
+    int out1_port_size = node->output_port_sizes[out1_port_number];
+    int out2_port_size = node->output_port_sizes[out2_port_number];
+
+    oassert(data1_port_size == data2_port_size);
+    oassert(out1_port_size == out2_port_size);
+    oassert(data1_port_size == out1_port_size);
+
+    int num_memories = ceil((double)data1_port_size / (double)target_size);
+
+    if (data1_port_size <= target_size) {
+        // If we're not splitting, put the original memory node back.
+        dp_memory_list = insert_in_vptr_list(dp_memory_list, node);
+    } else {
+        int data1_pins_moved = 0;
+        int data2_pins_moved = 0;
+        int out1_pins_moved = 0;
+        int out2_pins_moved = 0;
+        for (int i = 0; i < num_memories; i++) {
+            nnode_t *new_node = allocate_nnode(node->loc);
+            new_node->name = append_string(node->name, "-%d", i);
+            dp_memory_list = insert_in_vptr_list(dp_memory_list, new_node);
+
+            /* Copy properties from the original node */
+            new_node->type = node->type;
+            new_node->related_ast_node = node->related_ast_node;
+            new_node->traverse_visited = node->traverse_visited;
+            new_node->node_data = NULL;
+
+            for (int j = 0; j < node->num_input_port_sizes; j++)
+                add_input_port_information(new_node, 0);
+
+            int index = 0;
+            int old_index = 0;
+            for (int j = 0; j < node->num_input_port_sizes; j++) {
+                // Move this node's share of data pins out of the data port of the original node.
+                if (j == data1_port_number) {
+                    // Skip over data pins we've already moved.
+                    old_index += data1_pins_moved;
+                    for (int k = 0; k < target_size && data1_pins_moved < data1_port_size; k++) {
+                        allocate_more_input_pins(new_node, 1);
+                        new_node->input_port_sizes[j]++;
+                        remap_pin_to_new_node(node->input_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                        data1_pins_moved++;
+                    }
+                    int remaining_data_pins = data1_port_size - data1_pins_moved;
+                    // Skip over pins we have yet to copy.
+                    old_index += remaining_data_pins;
+                } else if (j == data2_port_number) {
+                    // Skip over data pins we've already moved.
+                    old_index += data2_pins_moved;
+                    for (int k = 0; k < target_size && data2_pins_moved < data2_port_size; k++) {
+                        allocate_more_input_pins(new_node, 1);
+                        new_node->input_port_sizes[j]++;
+                        remap_pin_to_new_node(node->input_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                        data2_pins_moved++;
+                    }
+                    int remaining_data_pins = data2_port_size - data2_pins_moved;
+                    // Skip over pins we have yet to copy.
+                    old_index += remaining_data_pins;
+                } else {
+                    for (int k = 0; k < node->input_port_sizes[j]; k++) {
+                        allocate_more_input_pins(new_node, 1);
+                        new_node->input_port_sizes[j]++;
+                        // Copy pins for all but the last memory. the last one get the original pins moved to it.
+                        if (i < num_memories - 1)
+                            add_input_pin_to_node(new_node, copy_input_npin(node->input_pins[old_index]), index);
+                        else
+                            remap_pin_to_new_node(node->input_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                    }
+                }
+            }
+
+            for (int j = 0; j < node->num_output_port_sizes; j++)
+                add_output_port_information(new_node, 0);
+
+            index = 0;
+            old_index = 0;
+            for (int j = 0; j < node->num_output_port_sizes; j++) {
+                // Move this node's share of data pins out of the data port of the original node.
+                if (j == out1_port_number) {
+                    // Skip over data pins we've already moved.
+                    old_index += out1_pins_moved;
+                    for (int k = 0; k < target_size && out1_pins_moved < out1_port_size; k++) {
+                        allocate_more_output_pins(new_node, 1);
+                        new_node->output_port_sizes[j]++;
+                        remap_pin_to_new_node(node->output_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                        out1_pins_moved++;
+                    }
+                    int remaining_pins = out1_port_size - out1_pins_moved;
+                    // Skip over pins we have yet to copy.
+                    old_index += remaining_pins;
+                } else if (j == out2_port_number) {
+                    // Skip over data pins we've already moved.
+                    old_index += out2_pins_moved;
+                    for (int k = 0; k < target_size && out2_pins_moved < out2_port_size; k++) {
+                        allocate_more_output_pins(new_node, 1);
+                        new_node->output_port_sizes[j]++;
+                        remap_pin_to_new_node(node->output_pins[old_index], new_node, index);
+                        index++;
+                        old_index++;
+                        out2_pins_moved++;
+                    }
+                    int remaining_pins = out2_port_size - out2_pins_moved;
+                    // Skip over pins we have yet to copy.
+                    old_index += remaining_pins;
+                } else {
+                    oassert(false);
+                }
+            }
+        }
+        // Free the original node.
+        free_nnode(node);
+    }
+}
+
+/*
+ * Determines the single port ram split depth based on the configuration
+ * variables and architecture.
+ *
+ * configuration.split_memory_depth selects the policy:
+ *   -1     use min_size of the model's "addr" input port,
+ *   -2     use size of the model's "addr" input port,
+ *   > 0    use the configured value as is,
+ *   other  fall back to size of the model's "addr" input port.
+ *
+ * The caller must make sure single_port_rams is available.
+ * Returns the selected depth, which is asserted to be positive.
+ */
+long get_sp_ram_split_depth()
+{
+    oassert(single_port_rams != NULL);
+
+    t_model_ports *hb_ports = get_model_port(single_port_rams->inputs, "addr");
+    // Same guard as pad_memory_input_port(): do not dereference a failed port lookup.
+    oassert(hb_ports != NULL);
+
+    long split_size;
+    if (configuration.split_memory_depth == -1) /* MIN */
+        split_size = hb_ports->min_size;
+    else if (configuration.split_memory_depth == -2) /* MAX */
+        split_size = hb_ports->size;
+    else if (configuration.split_memory_depth > 0)
+        split_size = configuration.split_memory_depth;
+    else
+        split_size = hb_ports->size;
+
+    oassert(split_size > 0);
+
+    return split_size;
+}
+
+/*
+ * Determines the dual port ram split depth based on the configuration
+ * variables and architecture.
+ *
+ * configuration.split_memory_depth selects the policy:
+ *   -1     use min_size of the model's "addr1" input port,
+ *   -2     use size of the model's "addr1" input port,
+ *   > 0    use the configured value as is,
+ *   other  fall back to size of the model's "addr1" input port.
+ *
+ * The caller must make sure dual_port_rams is available.
+ * Returns the selected depth, which is asserted to be positive.
+ */
+long get_dp_ram_split_depth()
+{
+    oassert(dual_port_rams != NULL);
+
+    t_model_ports *hb_ports = get_model_port(dual_port_rams->inputs, "addr1");
+    // Same guard as pad_memory_input_port(): do not dereference a failed port lookup.
+    oassert(hb_ports != NULL);
+
+    long split_depth;
+    if (configuration.split_memory_depth == -1) /* MIN */
+        split_depth = hb_ports->min_size;
+    else if (configuration.split_memory_depth == -2) /* MAX */
+        split_depth = hb_ports->size;
+    else if (configuration.split_memory_depth > 0)
+        split_depth = configuration.split_memory_depth;
+    else
+        split_depth = hb_ports->size;
+
+    oassert(split_depth > 0);
+
+    return split_depth;
+}
+
+/*
+ * Determines the single port ram split width based on the configuration
+ * variables and architecture.
+ *
+ * Returns 1 when configuration.split_memory_width is set; otherwise returns
+ * the size of the "data" input port of the single port ram model.
+ */
+int get_sp_ram_split_width()
+{
+    if (configuration.split_memory_width)
+        return 1;
+
+    // The model is only consulted when the width is taken from the architecture.
+    oassert(single_port_rams != NULL);
+    t_model_ports *ports = get_model_port(single_port_rams->inputs, "data");
+    // Same guard as pad_memory_input_port(): do not dereference a failed port lookup.
+    oassert(ports != NULL);
+
+    return ports->size;
+}
+
+/*
+ * Determines the dual port ram split width based on the configuration
+ * variables and architecture.
+ *
+ * Returns 1 when configuration.split_memory_width is set; otherwise returns
+ * the size of the "data1" input port of the dual port ram model.
+ */
+int get_dp_ram_split_width()
+{
+    if (configuration.split_memory_width)
+        return 1;
+
+    // The model is only consulted when the width is taken from the architecture.
+    oassert(dual_port_rams != NULL);
+    t_model_ports *ports = get_model_port(dual_port_rams->inputs, "data1");
+    // Same guard as pad_memory_input_port(): do not dereference a failed port lookup.
+    oassert(ports != NULL);
+
+    return ports->size;
+}
+
+/*
+ * Rebuilds sp_memory_list and dp_memory_list so that they only hold memories
+ * whose depth exceeds configuration.soft_logic_memory_depth_threshold or whose
+ * width exceeds configuration.soft_logic_memory_width_threshold.
+ *
+ * Memories at or below both thresholds are dropped from the lists; the nodes
+ * themselves are not freed here. A list is only processed when the matching
+ * ram model (single_port_rams / dual_port_rams) is available.
+ */
+void filter_memories_by_soft_logic_cutoff()
+{
+    if (single_port_rams) {
+        // Detach the current list and re-insert only the survivors.
+        t_linked_vptr *pending = sp_memory_list;
+        sp_memory_list = NULL;
+        while (pending != NULL) {
+            nnode_t *memory = (nnode_t *)pending->data_vptr;
+            oassert(memory != NULL);
+            oassert(memory->type == MEMORY);
+            pending = delete_in_vptr_list(pending);
+
+            long mem_depth = get_sp_ram_depth(memory);
+            long mem_width = get_sp_ram_width(memory);
+            bool within_cutoff = !(mem_depth > configuration.soft_logic_memory_depth_threshold)
+                                 && !(mem_width > configuration.soft_logic_memory_width_threshold);
+            if (within_cutoff)
+                continue;
+
+            sp_memory_list = insert_in_vptr_list(sp_memory_list, memory);
+        }
+    }
+
+    if (dual_port_rams) {
+        // Same procedure for the dual port memories.
+        t_linked_vptr *pending = dp_memory_list;
+        dp_memory_list = NULL;
+        while (pending != NULL) {
+            nnode_t *memory = (nnode_t *)pending->data_vptr;
+            oassert(memory != NULL);
+            oassert(memory->type == MEMORY);
+            pending = delete_in_vptr_list(pending);
+
+            long mem_depth = get_dp_ram_depth(memory);
+            long mem_width = get_dp_ram_width(memory);
+            bool within_cutoff = !(mem_depth > configuration.soft_logic_memory_depth_threshold)
+                                 && !(mem_width > configuration.soft_logic_memory_width_threshold);
+            if (within_cutoff)
+                continue;
+
+            dp_memory_list = insert_in_vptr_list(dp_memory_list, memory);
+        }
+    }
+}
+
+/*-------------------------------------------------------------------------
+ * (function: iterate_memories)
+ *
+ * This function will iterate over all of the memory hard blocks that
+ *      exist in the netlist and perform a splitting so that they can
+ *      be easily packed into hard memory blocks on the FPGA.
+ *
+ * For each available ram model (single port, then dual port) the matching
+ *      list is processed in three passes: depth split, width split, and
+ *      padding of the data/out/addr ports to the model's port sizes.
+ *      Every pass detaches the global list and drains it node by node.
+ *
+ * This function will drop memories which fall below the soft logic threshold,
+ * if those configuration variables are set.
+ *
+ * netlist is forwarded to the padding helpers.
+ *-----------------------------------------------------------------------*/
+void iterate_memories(netlist_t *netlist)
+{
+    /* Report on Logical Memory usage */
+    check_memories_and_report_distribution();
+
+    // Remove memories that don't meet the soft logic cutoff (filters both lists).
+    filter_memories_by_soft_logic_cutoff();
+
+    if (single_port_rams) {
+        // Depth split. NOTE(review): split_sp_memory_depth() presumably re-inserts its results into sp_memory_list -- confirm.
+        int split_depth = get_sp_ram_split_depth(); // NOTE(review): the helper returns long; the value is narrowed to int here.
+        t_linked_vptr *temp = sp_memory_list;
+        sp_memory_list = NULL; // detach the list; temp is drained below
+        while (temp != NULL) {
+            nnode_t *node = (nnode_t *)temp->data_vptr;
+            oassert(node != NULL);
+            oassert(node->type == MEMORY);
+            temp = delete_in_vptr_list(temp);
+            split_sp_memory_depth(node, split_depth);
+        }
+
+        // Width split. NOTE(review): split_sp_memory_width() presumably re-inserts its results into sp_memory_list -- confirm.
+        int split_width = get_sp_ram_split_width();
+        temp = sp_memory_list;
+        sp_memory_list = NULL;
+        while (temp != NULL) {
+            nnode_t *node = (nnode_t *)temp->data_vptr;
+            oassert(node != NULL);
+            oassert(node->type == MEMORY);
+            temp = delete_in_vptr_list(temp);
+            split_sp_memory_width(node, split_width);
+        }
+
+        // Remove memories that are too small to use hard blocks (this call filters the dual port list as well).
+        filter_memories_by_soft_logic_cutoff();
+
+        // Pad the rest; pad_sp_memory_width() puts each node back into sp_memory_list.
+        temp = sp_memory_list;
+        sp_memory_list = NULL;
+        while (temp != NULL) {
+            nnode_t *node = (nnode_t *)temp->data_vptr;
+            oassert(node != NULL);
+            oassert(node->type == MEMORY);
+            temp = delete_in_vptr_list(temp);
+            pad_sp_memory_width(node, netlist);
+            pad_memory_input_port(node, netlist, single_port_rams, "addr");
+        }
+    }
+
+    if (dual_port_rams) {
+        // Depth split. NOTE(review): split_dp_memory_depth() presumably re-inserts its results into dp_memory_list -- confirm.
+        int split_depth = get_dp_ram_split_depth(); // NOTE(review): the helper returns long; the value is narrowed to int here.
+        t_linked_vptr *temp = dp_memory_list;
+        dp_memory_list = NULL; // detach the list; temp is drained below
+        while (temp != NULL) {
+            nnode_t *node = (nnode_t *)temp->data_vptr;
+            oassert(node != NULL);
+            oassert(node->type == MEMORY);
+            temp = delete_in_vptr_list(temp);
+            split_dp_memory_depth(node, split_depth);
+        }
+
+        // Width split; the width split routine inserts each resulting memory into dp_memory_list.
+        int split_width = get_dp_ram_split_width();
+        temp = dp_memory_list;
+        dp_memory_list = NULL;
+        while (temp != NULL) {
+            nnode_t *node = (nnode_t *)temp->data_vptr;
+            oassert(node != NULL);
+            oassert(node->type == MEMORY);
+            temp = delete_in_vptr_list(temp);
+            split_dp_memory_width(node, split_width);
+        }
+
+        // Remove memories that are too small to use hard blocks (runs over the single port list again as well).
+        filter_memories_by_soft_logic_cutoff();
+
+        // Pad the rest; pad_dp_memory_width() puts each node back into dp_memory_list.
+        temp = dp_memory_list;
+        dp_memory_list = NULL;
+        while (temp != NULL) {
+            nnode_t *node = (nnode_t *)temp->data_vptr;
+            oassert(node != NULL);
+            oassert(node->type == MEMORY);
+            temp = delete_in_vptr_list(temp);
+            pad_dp_memory_width(node, netlist);
+            pad_memory_input_port(node, netlist, dual_port_rams, "addr1");
+            pad_memory_input_port(node, netlist, dual_port_rams, "addr2");
+        }
+    }
+}
+
+/*-------------------------------------------------------------------------
+ * (function: free_memory_lists)
+ *
+ * Empties sp_memory_list and dp_memory_list by deleting their list
+ *      entries one at a time until both heads are NULL.
+ *-----------------------------------------------------------------------*/
+void free_memory_lists()
+{
+    // Each call removes the head entry; its return value becomes the new head.
+    for (; sp_memory_list != NULL; sp_memory_list = delete_in_vptr_list(sp_memory_list)) {
+    }
+
+    for (; dp_memory_list != NULL; dp_memory_list = delete_in_vptr_list(dp_memory_list)) {
+    }
+}
+
+/*
+ * Pads the data and output ports of a dual port memory to the sizes given by
+ * the dual port ram model, then inserts the node into dp_memory_list.
+ *
+ * Ports are padded in the order data1, data2, out1, out2.
+ */
+void pad_dp_memory_width(nnode_t *node, netlist_t *netlist)
+{
+    oassert(node->type == MEMORY);
+    oassert(dual_port_rams != NULL);
+
+    const char *const data_ports[] = {"data1", "data2"};
+    const char *const out_ports[] = {"out1", "out2"};
+
+    for (int p = 0; p < 2; p++)
+        pad_memory_input_port(node, netlist, dual_port_rams, data_ports[p]);
+
+    for (int p = 0; p < 2; p++)
+        pad_memory_output_port(node, netlist, dual_port_rams, out_ports[p]);
+
+    // The padded memory goes back onto the global dual port list.
+    dp_memory_list = insert_in_vptr_list(dp_memory_list, node);
+}
+
+/*
+ * Pads the "data" input port and the "out" output port of a single port
+ * memory to the sizes given by the single port ram model, then inserts the
+ * node into sp_memory_list.
+ */
+void pad_sp_memory_width(nnode_t *node, netlist_t *netlist)
+{
+    oassert(node->type == MEMORY);
+    oassert(single_port_rams != NULL);
+
+    t_model *ram_model = single_port_rams;
+
+    // Input side first, then the output side.
+    pad_memory_input_port(node, netlist, ram_model, "data");
+    pad_memory_output_port(node, netlist, ram_model, "out");
+
+    // The padded memory goes back onto the global single port list.
+    sp_memory_list = insert_in_vptr_list(sp_memory_list, node);
+}
+
+/*
+ * Pads the given output port to the width specified in the given model.
+ *
+ * node       memory node whose output port is widened in place
+ * (netlist)  unused
+ * model      ram model that supplies the target size for port_name
+ * port_name  mapping name of the output port to pad
+ *
+ * Nothing happens when the port already has at least the model's size.
+ * Otherwise, pins located after the port are shifted up and fresh output pins
+ * named "unconnected_memory_output~<n>" fill the new high-order positions;
+ * <n> comes from a function-local static counter.
+ */
+void pad_memory_output_port(nnode_t *node, netlist_t * /*netlist*/, t_model *model, const char *port_name)
+{
+    static int pad_pin_number = 0;
+
+    oassert(model != NULL);
+
+    int port_number = get_output_port_index_from_mapping(node, port_name);
+    int port_index = get_output_pin_index_from_mapping(node, port_name);
+
+    // Mirror pad_memory_input_port(): refuse to index the port tables with a failed lookup.
+    oassert(port_number != -1);
+    oassert(port_index != -1);
+
+    int port_size = node->output_port_sizes[port_number];
+
+    t_model_ports *ports = get_model_port(model->outputs, port_name);
+
+    oassert(ports != NULL);
+
+    int target_size = ports->size;
+    int diff = target_size - port_size;
+
+    if (diff > 0) {
+        allocate_more_output_pins(node, diff);
+
+        // Shift other pins to the right, if any.
+        for (int i = node->num_output_pins - 1; i >= port_index + target_size; i--)
+            move_output_pin(node, i - diff, i);
+
+        for (int i = port_index + port_size; i < port_index + target_size; i++) {
+            // Add new pins to the higher order spots.
+            npin_t *new_pin = allocate_npin();
+            // Pad outputs with a unique and descriptive name to avoid collisions.
+            new_pin->name = append_string("", "unconnected_memory_output~%d", pad_pin_number++);
+            new_pin->mapping = vtr::strdup(port_name);
+            add_output_pin_to_node(node, new_pin, i);
+        }
+        node->output_port_sizes[port_number] = target_size;
+    }
+}
+
+/*
+ * Pads the given input port to the width specified in the given model.
+ *
+ * node       memory node whose input port is widened in place
+ * netlist    source of the pad pins used to fill the new positions
+ * model      ram model that supplies the target size for port_name
+ * port_name  mapping name of the input port to pad
+ *
+ * Nothing happens when the port already has at least the model's size.
+ */
+void pad_memory_input_port(nnode_t *node, netlist_t *netlist, t_model *model, const char *port_name)
+{
+    oassert(node->type == MEMORY);
+    oassert(model != NULL);
+
+    const int port_number = get_input_port_index_from_mapping(node, port_name);
+    const int first_pin = get_input_pin_index_from_mapping(node, port_name);
+
+    oassert(port_number != -1);
+    oassert(first_pin != -1);
+
+    const int current_width = node->input_port_sizes[port_number];
+
+    t_model_ports *model_port = get_model_port(model->inputs, port_name);
+
+    oassert(model_port != NULL);
+
+    const int wanted_width = model_port->size;
+    const int missing = wanted_width - current_width;
+
+    // Already wide enough: leave the node untouched.
+    if (missing <= 0)
+        return;
+
+    allocate_more_input_pins(node, missing);
+
+    // Move the pins that follow this port up by the number of new pins.
+    const int port_end = first_pin + wanted_width;
+    for (int dst = node->num_input_pins - 1; dst >= port_end; dst--)
+        move_input_pin(node, dst - missing, dst);
+
+    // Fill the freed high-order positions with pad pins mapped to this port.
+    for (int slot = first_pin + current_width; slot < port_end; slot++) {
+        add_input_pin_to_node(node, get_pad_pin(netlist), slot);
+        npin_t *pad = node->input_pins[slot];
+        if (pad->mapping) {
+            vtr::free(pad->mapping);
+        }
+        pad->mapping = vtr::strdup(port_name);
+    }
+
+    node->input_port_sizes[port_number] = wanted_width;
+}
+
+/*
+ * Returns true when the identifier of the memory node's related AST node
+ * equals SINGLE_PORT_RAM_string.
+ */
+bool is_sp_ram(nnode_t *node)
+{
+    oassert(node != NULL);
+    oassert(node->type == MEMORY);
+
+    const char *identifier = node->related_ast_node->identifier_node->types.identifier;
+    return strcmp(identifier, SINGLE_PORT_RAM_string) == 0;
+}
+
+/*
+ * Returns true when the identifier of the memory node's related AST node
+ * equals DUAL_PORT_RAM_string.
+ */
+bool is_dp_ram(nnode_t *node)
+{
+    oassert(node != NULL);
+    oassert(node->type == MEMORY);
+
+    const char *identifier = node->related_ast_node->identifier_node->types.identifier;
+    return strcmp(identifier, DUAL_PORT_RAM_string) == 0;
+}
+
+/**
+ * (function: is_blif_sp_ram)
+ *
+ * @brief checks whether the given node looks like a single port ram:
+ * its name contains SINGLE_PORT_RAM_string, it has 4 input ports and
+ * 1 output port, every input pin is mapped to "we", "clk", "addr" or
+ * "data", and every output pin is mapped to "out"
+ *
+ * @param node the node to inspect; its name and all pin mappings must be set
+ *
+ * @return true when all of the checks above pass
+ */
+bool is_blif_sp_ram(nnode_t *node)
+{
+    oassert(node->name);
+
+    /* the node name must mention the single port ram primitive */
+    if (std::string(node->name).find(SINGLE_PORT_RAM_string) == std::string::npos)
+        return false;
+
+    /* check the num input/output ports */
+    if (node->num_input_port_sizes != 4 || node->num_output_port_sizes != 1)
+        return false;
+
+    /* port connections were passed by name; verify input port names */
+    for (int pin = 0; pin < node->num_input_pins; pin++) {
+        oassert(node->input_pins[pin]->mapping);
+        const char *mapping = node->input_pins[pin]->mapping;
+
+        bool known_input = strcmp(mapping, "we") == 0 || strcmp(mapping, "clk") == 0 || strcmp(mapping, "addr") == 0
+                           || strcmp(mapping, "data") == 0;
+        if (!known_input)
+            return false;
+    }
+
+    /* port connections were passed by name; verify output port names */
+    for (int pin = 0; pin < node->num_output_pins; pin++) {
+        oassert(node->output_pins[pin]->mapping);
+        const char *mapping = node->output_pins[pin]->mapping;
+
+        if (strcmp(mapping, "out") != 0)
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * (function: is_blif_dp_ram)
+ *
+ * @brief checks whether the given node looks like a dual port ram:
+ * its name contains DUAL_PORT_RAM_string, it has 7 input ports and
+ * 2 output ports, every input pin is mapped to "clk", "we1", "we2",
+ * "addr1", "addr2", "data1" or "data2", and every output pin is mapped
+ * to "out1" or "out2"
+ *
+ * @param node the node to inspect; its name and all pin mappings must be set
+ *
+ * @return true when all of the checks above pass
+ */
+bool is_blif_dp_ram(nnode_t *node)
+{
+    oassert(node->name);
+
+    /* the node name must mention the dual port ram primitive */
+    if (std::string(node->name).find(DUAL_PORT_RAM_string) == std::string::npos)
+        return false;
+
+    /* check the num input/output ports */
+    if (node->num_input_port_sizes != 7 || node->num_output_port_sizes != 2)
+        return false;
+
+    /* port connections were passed by name; verify input port names */
+    for (int pin = 0; pin < node->num_input_pins; pin++) {
+        oassert(node->input_pins[pin]->mapping);
+        const char *mapping = node->input_pins[pin]->mapping;
+
+        bool known_input = strcmp(mapping, "clk") == 0 || strcmp(mapping, "we1") == 0 || strcmp(mapping, "we2") == 0
+                           || strcmp(mapping, "addr1") == 0 || strcmp(mapping, "addr2") == 0
+                           || strcmp(mapping, "data1") == 0 || strcmp(mapping, "data2") == 0;
+        if (!known_input)
+            return false;
+    }
+
+    /* port connections were passed by name; verify output port names */
+    for (int pin = 0; pin < node->num_output_pins; pin++) {
+        oassert(node->output_pins[pin]->mapping);
+        const char *mapping = node->output_pins[pin]->mapping;
+
+        bool known_output = strcmp(mapping, "out1") == 0 || strcmp(mapping, "out2") == 0;
+        if (!known_output)
+            return false;
+    }
+
+    return true;
+}
+
+/*
+ * Collects the pins of a single port ram node into a newly allocated
+ * sp_ram_signals structure, grouped by pin mapping ("addr", "data", "we",
+ * "clk" on the input side and "out" on the output side).
+ *
+ * Any other mapping is reported through error_message(). The result is
+ * released with free_sp_ram_signals().
+ */
+sp_ram_signals *get_sp_ram_signals(nnode_t *node)
+{
+    oassert(is_sp_ram(node));
+
+    ast_node_t *ast_node = node->related_ast_node;
+    sp_ram_signals *result = (sp_ram_signals *)vtr::malloc(sizeof(sp_ram_signals));
+
+    // Start with empty lists and no control pins.
+    result->addr = init_signal_list();
+    result->data = init_signal_list();
+    result->out = init_signal_list();
+    result->we = NULL;
+    result->clk = NULL;
+
+    // Sort the input pins into their groups.
+    for (int idx = 0; idx < node->num_input_pins; idx++) {
+        npin_t *pin = node->input_pins[idx];
+        if (strcmp(pin->mapping, "addr") == 0) {
+            add_pin_to_signal_list(result->addr, pin);
+        } else if (strcmp(pin->mapping, "data") == 0) {
+            add_pin_to_signal_list(result->data, pin);
+        } else if (strcmp(pin->mapping, "we") == 0) {
+            result->we = pin;
+        } else if (strcmp(pin->mapping, "clk") == 0) {
+            result->clk = pin;
+        } else {
+            error_message(NETLIST, ast_node->loc, "Unexpected input pin mapping \"%s\" on memory node: %s\n", pin->mapping, node->name);
+        }
+    }
+
+    // Every control pin must be present, and there is one output pin per data pin.
+    oassert(result->clk != NULL);
+    oassert(result->we != NULL);
+    oassert(result->addr->count >= 1);
+    oassert(result->data->count >= 1);
+    oassert(result->data->count == node->num_output_pins);
+
+    // Collect the output pins.
+    for (int idx = 0; idx < node->num_output_pins; idx++) {
+        npin_t *pin = node->output_pins[idx];
+        if (strcmp(pin->mapping, "out") == 0) {
+            add_pin_to_signal_list(result->out, pin);
+        } else {
+            error_message(NETLIST, ast_node->loc, "Unexpected output pin mapping \"%s\" on memory node: %s\n", pin->mapping, node->name);
+        }
+    }
+
+    oassert(result->out->count == result->data->count);
+
+    return result;
+}
+
+/*
+ * Releases a structure obtained from get_sp_ram_signals(): the data, addr and
+ * out signal lists are freed, then the structure itself.
+ *
+ * Passing NULL is a no-op.
+ */
+void free_sp_ram_signals(sp_ram_signals *signalsvar)
+{
+    // Behave like other free-style routines instead of dereferencing NULL.
+    if (signalsvar == NULL)
+        return;
+
+    free_signal_list(signalsvar->data);
+    free_signal_list(signalsvar->addr);
+    free_signal_list(signalsvar->out);
+
+    vtr::free(signalsvar);
+}
+
+/*
+ * Collects the pins of a dual port ram node into a newly allocated
+ * dp_ram_signals structure, grouped by pin mapping ("addr1", "addr2",
+ * "data1", "data2", "we1", "we2", "clk" on the input side and "out1",
+ * "out2" on the output side).
+ *
+ * Any other mapping is reported through error_message(). The result is
+ * released with free_dp_ram_signals().
+ */
+dp_ram_signals *get_dp_ram_signals(nnode_t *node)
+{
+    oassert(is_dp_ram(node));
+
+    ast_node_t *ast_node = node->related_ast_node;
+    dp_ram_signals *result = (dp_ram_signals *)vtr::malloc(sizeof(dp_ram_signals));
+
+    // Start with empty lists and no control pins.
+    result->addr1 = init_signal_list();
+    result->addr2 = init_signal_list();
+    result->data1 = init_signal_list();
+    result->data2 = init_signal_list();
+    result->out1 = init_signal_list();
+    result->out2 = init_signal_list();
+    result->we1 = NULL;
+    result->we2 = NULL;
+    result->clk = NULL;
+
+    // Sort the input pins into their groups.
+    for (int idx = 0; idx < node->num_input_pins; idx++) {
+        npin_t *pin = node->input_pins[idx];
+        if (strcmp(pin->mapping, "addr1") == 0) {
+            add_pin_to_signal_list(result->addr1, pin);
+        } else if (strcmp(pin->mapping, "addr2") == 0) {
+            add_pin_to_signal_list(result->addr2, pin);
+        } else if (strcmp(pin->mapping, "data1") == 0) {
+            add_pin_to_signal_list(result->data1, pin);
+        } else if (strcmp(pin->mapping, "data2") == 0) {
+            add_pin_to_signal_list(result->data2, pin);
+        } else if (strcmp(pin->mapping, "we1") == 0) {
+            result->we1 = pin;
+        } else if (strcmp(pin->mapping, "we2") == 0) {
+            result->we2 = pin;
+        } else if (strcmp(pin->mapping, "clk") == 0) {
+            result->clk = pin;
+        } else {
+            error_message(NETLIST, ast_node->loc, "Unexpected input pin mapping \"%s\" on memory node: %s\n", pin->mapping, node->name);
+        }
+    }
+
+    // Sanity checks: all control pins present, both ports equally sized,
+    // and one output pin for every data pin.
+    oassert(result->clk != NULL);
+    oassert(result->we1 != NULL && result->we2 != NULL);
+    oassert(result->addr1->count >= 1 && result->data1->count >= 1);
+    oassert(result->addr2->count >= 1 && result->data2->count >= 1);
+    oassert(result->addr1->count == result->addr2->count);
+    oassert(result->data1->count == result->data2->count);
+    oassert(result->data1->count + result->data2->count == node->num_output_pins);
+
+    // Sort the output pins into their groups.
+    for (int idx = 0; idx < node->num_output_pins; idx++) {
+        npin_t *pin = node->output_pins[idx];
+        if (strcmp(pin->mapping, "out1") == 0) {
+            add_pin_to_signal_list(result->out1, pin);
+        } else if (strcmp(pin->mapping, "out2") == 0) {
+            add_pin_to_signal_list(result->out2, pin);
+        } else {
+            error_message(NETLIST, ast_node->loc, "Unexpected output pin mapping \"%s\" on memory node: %s\n", pin->mapping, node->name);
+        }
+    }
+
+    oassert(result->out1->count == result->out2->count);
+    oassert(result->out1->count == result->data1->count);
+
+    return result;
+}
+
+/*
+ * Releases a structure obtained from get_dp_ram_signals(): the data1, data2,
+ * addr1, addr2, out1 and out2 signal lists are freed, then the structure
+ * itself.
+ *
+ * Passing NULL is a no-op.
+ */
+void free_dp_ram_signals(dp_ram_signals *signalsvar)
+{
+    // Behave like other free-style routines instead of dereferencing NULL.
+    if (signalsvar == NULL)
+        return;
+
+    free_signal_list(signalsvar->data1);
+    free_signal_list(signalsvar->data2);
+    free_signal_list(signalsvar->addr1);
+    free_signal_list(signalsvar->addr2);
+    free_signal_list(signalsvar->out1);
+    free_signal_list(signalsvar->out2);
+
+    vtr::free(signalsvar);
+}
+
+/*
+ * Expands the given single port ram block into soft logic.
+ *
+ * Structure that is built:
+ *   - one address decoder over the "addr" pins,
+ *   - one AND gate per decoder output, combining that output with "we",
+ *   - for every data bit and every decoder output: a NOT gate, a MUX_2 that
+ *     takes either the incoming data bit or the stored value, and an FF_NODE
+ *     clocked by "clk" that stores the value,
+ *   - one MULTI_PORT_MUX per data bit that drives the original output pin.
+ *
+ * The original we/data/clk pins are remapped to the first gate that uses them;
+ * every further use gets a copy of the pin. The original node is freed at the
+ * end, so callers must not use it afterwards.
+ *
+ * node     single port ram node to expand (must satisfy is_sp_ram())
+ * mark     passed through to the gate constructors and helper calls
+ * netlist  used for the zero/one/pad net checks and forwarded to helpers
+ */
+void instantiate_soft_single_port_ram(nnode_t *node, short mark, netlist_t *netlist)
+{
+    oassert(is_sp_ram(node));
+
+    sp_ram_signals *signals = get_sp_ram_signals(node);
+
+    // Construct an address decoder.
+    signal_list_t *decoder = create_decoder(node, mark, signals->addr, netlist);
+
+    // The total number of memory addresses. (2^address_bits)
+    long num_addr = decoder->count;
+
+    nnode_t **and_gates = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * num_addr); // one write-enable gate per address
+
+    for (long i = 0; i < num_addr; i++) {
+        npin_t *address_pin = decoder->pins[i];
+        /* Check that the input pin is driven */
+        oassert(address_pin->net->num_driver_pins || address_pin->net == netlist->zero_net || address_pin->net == netlist->one_net ||
+                address_pin->net == netlist->pad_net);
+
+        // An AND gate to enable and disable writing.
+        nnode_t *and_g = make_1port_logic_gate(LOGICAL_AND, 2, node, mark);
+        add_input_pin_to_node(and_g, address_pin, 0);
+
+        if (!i)
+            remap_pin_to_new_node(signals->we, and_g, 1); // first gate takes over the original we pin
+        else
+            add_input_pin_to_node(and_g, copy_input_npin(signals->we), 1); // later gates get a copy
+
+        and_gates[i] = and_g;
+    }
+
+    for (long i = 0; i < signals->data->count; i++) { // one iteration per data bit
+        npin_t *data_pin = signals->data->pins[i];
+
+        // The output multiplexer determines which memory cell is connected to the output register.
+        nnode_t *output_mux = make_2port_gate(MULTI_PORT_MUX, num_addr, num_addr, 1, node, mark);
+
+        for (int j = 0; j < num_addr; j++) { // one memory cell per address
+            npin_t *address_pin = decoder->pins[j];
+            /* Check that the input pin is driven */
+            oassert(address_pin->net->num_driver_pins || address_pin->net == netlist->zero_net || address_pin->net == netlist->one_net ||
+                    address_pin->net == netlist->pad_net);
+
+            // A multiplexer switches between accepting incoming data and keeping existing data.
+            nnode_t *mux = make_2port_gate(MUX_2, 2, 2, 1, node, mark);
+            nnode_t *not_g = make_not_gate(node, mark);
+            connect_nodes(and_gates[j], 0, not_g, 0);
+            connect_nodes(and_gates[j], 0, mux, 0); // mux input 0: write enable for this address
+            connect_nodes(not_g, 0, mux, 1); // mux input 1: inverted write enable
+            if (!j)
+                remap_pin_to_new_node(data_pin, mux, 2); // first cell takes over the original data pin
+            else
+                add_input_pin_to_node(mux, copy_input_npin(data_pin), 2); // later cells get a copy
+
+            // A flipflop holds the value of each memory cell.
+            nnode_t *ff = make_2port_gate(FF_NODE, 1, 1, 1, node, mark);
+            connect_nodes(mux, 0, ff, 0);
+            if (!i && !j)
+                remap_pin_to_new_node(signals->clk, ff, 1); // very first flipflop takes over the original clk pin
+            else
+                add_input_pin_to_node(ff, copy_input_npin(signals->clk), 1);
+
+            // The output of the flipflop connects back to the multiplexer (to hold the value.)
+            connect_nodes(ff, 0, mux, 3);
+
+            // The flipflop connects to the output multiplexer.
+            connect_nodes(ff, 0, output_mux, num_addr + j); // flipflop outputs use the second group of num_addr inputs
+
+            // Hook the address pin up to the output mux.
+            add_input_pin_to_node(output_mux, copy_input_npin(address_pin), j); // decoder lines use the first group of num_addr inputs
+            ff->attributes->clk_edge_type = RISING_EDGE_SENSITIVITY;
+        }
+
+        npin_t *output_pin = node->output_pins[i]; // output pin i pairs with data bit i
+
+        // Make sure the BLIF name comes directly from the MUX.
+        if (output_pin->name)
+            vtr::free(output_pin->name);
+        output_pin->name = NULL;
+
+        remap_pin_to_new_node(output_pin, output_mux, 0);
+        instantiate_multi_port_mux(output_mux, mark, netlist);
+    }
+
+    vtr::free(and_gates); // frees only the pointer array, not the gates
+
+    // Free signal lists.
+    free_sp_ram_signals(signals);
+    free_signal_list(decoder);
+
+    // Free the original hard block memory.
+    free_nnode(node);
+}
+
+/*
+ * Expands the given dual port ram block into soft logic.
+ */
+void instantiate_soft_dual_port_ram(nnode_t *node, short mark, netlist_t *netlist)
+{
+    oassert(is_dp_ram(node));
+
+    dp_ram_signals *signals = get_dp_ram_signals(node);
+
+    // Construct the address decoders.
+    signal_list_t *decoder1 = create_decoder(node, mark, signals->addr1, netlist);
+    signal_list_t *decoder2 = create_decoder(node, mark, signals->addr2, netlist);
+
+    oassert(decoder1->count == decoder2->count);
+
+    // The total number of memory addresses. (2^address_bits)
+    int num_addr = decoder1->count;
+    int data_width = signals->data1->count;
+
+    // Arrays of common gates, one per address.
+    nnode_t **and1_gates = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * num_addr);
+    nnode_t **and2_gates = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * num_addr);
+    nnode_t **or_gates = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * num_addr);
+
+    for (int i = 0; i < num_addr; i++) {
+        npin_t *addr1_pin = decoder1->pins[i];
+        npin_t *addr2_pin = decoder2->pins[i];
+
+        oassert(addr1_pin->net->num_driver_pins || addr1_pin->net == netlist->zero_net || addr1_pin->net == netlist->one_net ||
+                addr1_pin->net == netlist->pad_net);
+        oassert(addr2_pin->net->num_driver_pins || addr2_pin->net == netlist->zero_net || addr2_pin->net == netlist->one_net ||
+                addr2_pin->net == netlist->pad_net);
+
+        // Write enable and gate for address 1.
+        nnode_t *and1 = make_1port_logic_gate(LOGICAL_AND, 2, node, mark);
+        add_input_pin_to_node(and1, addr1_pin, 0);
+
+        if (!i)
+            remap_pin_to_new_node(signals->we1, and1, 1);
+        else
+            add_input_pin_to_node(and1, copy_input_npin(signals->we1), 1);
+
+        // Write enable and gate for address 2.
+        nnode_t *and2 = make_1port_logic_gate(LOGICAL_AND, 2, node, mark);
+        add_input_pin_to_node(and2, addr2_pin, 0);
+
+        if (!i)
+            remap_pin_to_new_node(signals->we2, and2, 1);
+        else
+            add_input_pin_to_node(and2, copy_input_npin(signals->we2), 1);
+
+        and1_gates[i] = and1;
+        and2_gates[i] = and2;
+
+        // OR, to enable writing to this address when either port selects it for writing.
+        nnode_t *or_g = make_1port_logic_gate(LOGICAL_OR, 2, node, mark);
+        connect_nodes(and1, 0, or_g, 0);
+        connect_nodes(and2, 0, or_g, 1);
+
+        or_gates[i] = or_g;
+    }
+
+    for (int i = 0; i < data_width; i++) {
+        npin_t *data1_pin = signals->data1->pins[i];
+        npin_t *data2_pin = signals->data2->pins[i];
+
+        // The output multiplexer determines which memory cell is connected to the output register.
+        nnode_t *output_mux1 = make_2port_gate(MULTI_PORT_MUX, num_addr, num_addr, 1, node, mark);
+        nnode_t *output_mux2 = make_2port_gate(MULTI_PORT_MUX, num_addr, num_addr, 1, node, mark);
+
+        for (int j = 0; j < num_addr; j++) {
+            npin_t *addr1_pin = decoder1->pins[j];
+            npin_t *addr2_pin = decoder2->pins[j];
+
+            oassert(addr1_pin->net->num_driver_pins || addr1_pin->net == netlist->zero_net || addr1_pin->net == netlist->one_net ||
+                    addr1_pin->net == netlist->pad_net);
+            oassert(addr2_pin->net->num_driver_pins || addr2_pin->net == netlist->zero_net || addr2_pin->net == netlist->one_net ||
+                    addr2_pin->net == netlist->pad_net);
+
+            // The data mux selects between the two data lines for this address.
+            nnode_t *data_mux = make_2port_gate(MUX_2, 2, 2, 1, node, mark);
+            // Port 2 before 1 to mimic the simulator's behaviour when the addresses are the same.
+            connect_nodes(and2_gates[j], 0, data_mux, 0);
+            connect_nodes(and1_gates[j], 0, data_mux, 1);
+            if (!j)
+                remap_pin_to_new_node(data2_pin, data_mux, 2);
+            else
+                add_input_pin_to_node(data_mux, copy_input_npin(data2_pin), 2);
+            if (!j)
+                remap_pin_to_new_node(data1_pin, data_mux, 3);
+            else
+                add_input_pin_to_node(data_mux, copy_input_npin(data1_pin), 3);
+
+            nnode_t *not_g = make_not_gate(node, mark);
+            connect_nodes(or_gates[j], 0, not_g, 0);
+
+            // A multiplexer switches between accepting incoming data and keeping existing data.
+            nnode_t *mux = make_2port_gate(MUX_2, 2, 2, 1, node, mark);
+            connect_nodes(or_gates[j], 0, mux, 0);
+            connect_nodes(not_g, 0, mux, 1);
+            connect_nodes(data_mux, 0, mux, 2);
+
+            // A flipflop holds the value of each memory cell.
+            nnode_t *ff = make_2port_gate(FF_NODE, 1, 1, 1, node, mark);
+            connect_nodes(mux, 0, ff, 0);
+            if (!i && !j)
+                remap_pin_to_new_node(signals->clk, ff, 1);
+            else
+                add_input_pin_to_node(ff, copy_input_npin(signals->clk), 1);
+
+            // The output of the flipflop connects back to the multiplexer (to hold the value.)
+            connect_nodes(ff, 0, mux, 3);
+
+            // Connect the flipflop to both output muxes.
+            connect_nodes(ff, 0, output_mux1, num_addr + j);
+            connect_nodes(ff, 0, output_mux2, num_addr + j);
+
+            // Connect address lines to the output muxes for this address.
+            add_input_pin_to_node(output_mux1, copy_input_npin(addr1_pin), j);
+            add_input_pin_to_node(output_mux2, copy_input_npin(addr2_pin), j);
+            ff->attributes->clk_edge_type = RISING_EDGE_SENSITIVITY;
+        }
+
+        npin_t *out1_pin = signals->out1->pins[i];
+        npin_t *out2_pin = signals->out2->pins[i];
+
+        // Make sure the BLIF name comes directly from the MUX.
+        if (out1_pin->name)
+            vtr::free(out1_pin->name);
+        out1_pin->name = NULL;
+
+        if (out2_pin->name)
+            vtr::free(out2_pin->name);
+        out2_pin->name = NULL;
+
+        remap_pin_to_new_node(out1_pin, output_mux1, 0);
+        remap_pin_to_new_node(out2_pin, output_mux2, 0);
+
+        // Convert the output muxes to MUX_2 nodes.
+        instantiate_multi_port_mux(output_mux1, mark, netlist);
+        instantiate_multi_port_mux(output_mux2, mark, netlist);
+    }
+
+    vtr::free(and1_gates);
+    vtr::free(and2_gates);
+    vtr::free(or_gates);
+
+    // Free signal lists.
+    free_dp_ram_signals(signals);
+    free_signal_list(decoder1);
+    free_signal_list(decoder2);
+
+    // Free the original hard block memory.
+    free_nnode(node);
+}
+
+/*
+ * Creates an n to 2^n decoder from the input signal list and returns the decoder output pins (one per AND gate) as a new signal list.
+ */
+signal_list_t *create_decoder(nnode_t *node, short mark, signal_list_t *input_list, netlist_t *netlist)
+{
+    long num_inputs = input_list->count;
+    if (num_inputs > SOFT_RAM_ADDR_LIMIT) // a soft decoder is only built for up to SOFT_RAM_ADDR_LIMIT address bits
+        error_message(
+          NETLIST, node->loc,
+          "Memory %s of depth 2^%ld exceeds ODIN bound of 2^%d.\nMust use an FPGA architecture that contains embedded hard block memories",
+          node->name, num_inputs, SOFT_RAM_ADDR_LIMIT);
+
+    // Number of outputs is 2^num_inputs
+    long num_outputs = shift_left_value_with_overflow_check(0x1, num_inputs, node->loc);
+
+    // Create NOT gates for all inputs and put the outputs in their own signal list (not_gates->pins[i] is a fanout pin of NOT gate i).
+    signal_list_t *not_gates = init_signal_list();
+    for (long i = 0; i < num_inputs; i++) {
+        if (!input_list->pins[i]->net->num_driver_pins && input_list->pins[i]->net != netlist->zero_net &&
+            input_list->pins[i]->net != netlist->one_net && input_list->pins[i]->net != netlist->pad_net) { // no driver, and not the zero/one/pad net
+            warning_message(NETLIST, node->loc, "Signal %s is not driven. padding with ground\n", input_list->pins[i]->name);
+            add_fanout_pin_to_net(netlist->zero_net, input_list->pins[i]);
+        }
+
+        nnode_t *not_g = make_not_gate(node, mark);
+        remap_pin_to_new_node(input_list->pins[i], not_g, 0);
+        npin_t *not_output = allocate_npin();
+        add_output_pin_to_node(not_g, not_output, 0);
+        nnet_t *net = allocate_nnet();
+        net->name = make_full_ref_name(NULL, NULL, NULL, not_g->name, 0);
+        add_driver_pin_to_net(net, not_output);
+        not_output = allocate_npin(); // second pin: the fanout side of the NOT gate's net
+        add_fanout_pin_to_net(net, not_output);
+        add_pin_to_signal_list(not_gates, not_output);
+
+        npin_t *pin = allocate_npin(); // fresh fanout pin on the input's net; it takes the original pin's slot in input_list
+        net = input_list->pins[i]->net;
+
+        add_fanout_pin_to_net(net, pin);
+
+        input_list->pins[i] = pin;
+    }
+
+    // Create AND gates and assign signals.
+    signal_list_t *return_list = init_signal_list();
+    for (long i = 0; i < num_outputs; i++) {
+        // Each output is connected to an and gate which is driven by a single permutation of the inputs.
+        nnode_t *and_g = make_1port_logic_gate(LOGICAL_AND, num_inputs, node, mark);
+
+        for (long j = 0; j < num_inputs; j++) {
+            // Look at the jth bit of i. If it's 0, take the negated signal.
+            long value = shift_left_value_with_overflow_check(0x1, j, and_g->loc);
+            value &= i;
+            value >>= j;
+
+            npin_t *pin = value ? input_list->pins[j] : not_gates->pins[j];
+
+            // i == 0 picks only negated pins and i == num_outputs - 1 only true pins: those two iterations consume the originals, all others use copies.
+            if (i > 0 && i < num_outputs - 1)
+                pin = copy_input_npin(pin);
+
+            // Connect the signal to the output and gate.
+            add_input_pin_to_node(and_g, pin, j);
+        }
+
+        // Add output pin, net, and fanout pin.
+        npin_t *output = allocate_npin();
+        nnet_t *net = allocate_nnet();
+        add_output_pin_to_node(and_g, output, 0);
+        net->name = make_full_ref_name(NULL, NULL, NULL, and_g->name, 0);
+        add_driver_pin_to_net(net, output);
+        output = allocate_npin();
+        add_fanout_pin_to_net(net, output);
+
+        // Add the fanout pin (decoder output) to the return list.
+        add_pin_to_signal_list(return_list, output);
+    }
+
+    free_signal_list(not_gates);
+    return return_list;
+}
+
+/**
+ * (function: create_single_port_ram)
+ *
+ * @brief create a single port ram with the given spram signals
+ *
+ * @param spram_signals spram signals (clk, we, addr, data, optional out); if out is NULL a new output list is created and stored in it
+ * @param node corresponding netlist node (provides loc and memory_id for the new node)
+ *
+ * @return a new single port ram (also inserted into sp_memory_list)
+ */
+nnode_t *create_single_port_ram(sp_ram_signals *spram_signals, nnode_t *node)
+{
+    /* sanity checks */
+    oassert(spram_signals->clk != NULL);
+    oassert(spram_signals->we != NULL);
+    oassert(spram_signals->addr->count >= 1);
+    oassert(spram_signals->data->count >= 1);
+    if (spram_signals->out != NULL) {
+        oassert(spram_signals->data->count == spram_signals->out->count);
+    }
+
+    /* create a single port ram node */
+    nnode_t *spram = allocate_nnode(node->loc);
+
+    spram->type = MEMORY;
+    /* some information from ast node is needed in partial mapping */
+    char *hb_name = vtr::strdup(SINGLE_PORT_RAM_string);
+    spram->name = node_name(spram, hb_name);
+    spram->attributes->memory_id = vtr::strdup(node->attributes->memory_id);
+
+    /* Create a fake ast node. */
+    spram->related_ast_node = create_node_w_type(RAM, node->loc);
+    spram->related_ast_node->identifier_node = create_tree_node_id(hb_name, node->loc);
+
+    /* INPUTS */
+    /* hook address port into spram */
+    add_input_port_to_memory(spram, spram_signals->addr, "addr");
+
+    /* hook data ports into spram */
+    add_input_port_to_memory(spram, spram_signals->data, "data");
+
+    /* hook enable pin to spram (wrapped in a temporary one-pin signal list) */
+    signal_list_t *we = init_signal_list();
+    add_pin_to_signal_list(we, spram_signals->we);
+    add_input_port_to_memory(spram, we, "we");
+
+    /* hook clk pin into spram (wrapped in a temporary one-pin signal list) */
+    signal_list_t *clk = init_signal_list();
+    add_pin_to_signal_list(clk, spram_signals->clk);
+    add_input_port_to_memory(spram, clk, "clk");
+
+    /* OUTPUT */
+    if (spram_signals->out == NULL) {
+        /* init the signal list: one new driver pin and net per data bit */
+        spram_signals->out = init_signal_list();
+        for (int i = 0; i < spram_signals->data->count; i++) {
+            npin_t *new_pin = allocate_npin();
+            nnet_t *new_net = allocate_nnet();
+            /* add pin as the net driver */
+            add_driver_pin_to_net(new_net, new_pin);
+            /* store them into spram signals */
+            add_pin_to_signal_list(spram_signals->out, new_pin);
+        }
+    }
+    add_output_port_to_memory(spram, spram_signals->out, "out");
+
+    /* already compatible with SPRAM config so we leave it as is */
+    sp_memory_list = insert_in_vptr_list(sp_memory_list, spram);
+    /* register the SPRAM in arch model to have the related model in BLIF for simulation */
+    register_memory_model(spram);
+
+    // CLEAN UP (only the temporary we/clk lists created in this function)
+    free_signal_list(we);
+    free_signal_list(clk);
+
+    return (spram);
+}
+
+/**
+ * (function: create_dual_port_ram)
+ *
+ * @brief create a dual port ram with the given dpram signals
+ *
+ * @param dpram_signals dpram signals (clk, we1/we2, addr1/addr2, data1/data2, optional out1/out2); a NULL out list is created and stored in it
+ * @param node corresponding netlist node (provides loc and memory_id for the new node)
+ *
+ * @return a new dual port ram (also inserted into dp_memory_list)
+ */
+nnode_t *create_dual_port_ram(dp_ram_signals *dpram_signals, nnode_t *node)
+{
+    /* sanity checks: every input must be present before its count is read below */
+    oassert(dpram_signals->clk);
+    oassert((dpram_signals->we1) && (dpram_signals->we2));
+    oassert((dpram_signals->addr1) && (dpram_signals->addr2));
+    oassert((dpram_signals->data1) && (dpram_signals->data2));
+    oassert(dpram_signals->addr1->count == dpram_signals->addr2->count);
+    oassert(dpram_signals->data1->count == dpram_signals->data2->count);
+    oassert(dpram_signals->addr1->count >= 1 && dpram_signals->data1->count >= 1);
+    oassert(dpram_signals->addr2->count >= 1 && dpram_signals->data2->count >= 1);
+
+    if (dpram_signals->out1 != NULL) {
+        oassert(dpram_signals->data1->count == dpram_signals->out1->count);
+    }
+    if (dpram_signals->out2 != NULL) {
+        oassert(dpram_signals->data2->count == dpram_signals->out2->count);
+    }
+
+    /* create a dual port ram node */
+    nnode_t *dpram = allocate_nnode(node->loc);
+
+    dpram->type = MEMORY;
+    /* some information from ast node is needed in partial mapping */
+    char *hb_name = vtr::strdup(DUAL_PORT_RAM_string);
+    dpram->name = node_name(dpram, hb_name);
+    dpram->attributes->memory_id = vtr::strdup(node->attributes->memory_id);
+
+    /* Create a fake ast node. */
+    dpram->related_ast_node = create_node_w_type(RAM, node->loc);
+    dpram->related_ast_node->identifier_node = create_tree_node_id(hb_name, node->loc);
+
+    /* INPUTS */
+    /* hook address ports into dpram */
+    add_input_port_to_memory(dpram, dpram_signals->addr1, "addr1");
+    add_input_port_to_memory(dpram, dpram_signals->addr2, "addr2");
+
+    /* hook data ports into dpram */
+    add_input_port_to_memory(dpram, dpram_signals->data1, "data1");
+    add_input_port_to_memory(dpram, dpram_signals->data2, "data2");
+
+    /* hook enable pins to dpram (each wrapped in a temporary one-pin signal list) */
+    signal_list_t *we1 = init_signal_list();
+    add_pin_to_signal_list(we1, dpram_signals->we1);
+    add_input_port_to_memory(dpram, we1, "we1");
+
+    signal_list_t *we2 = init_signal_list();
+    add_pin_to_signal_list(we2, dpram_signals->we2);
+    add_input_port_to_memory(dpram, we2, "we2");
+
+    /* hook clk pin into dpram (wrapped in a temporary one-pin signal list) */
+    signal_list_t *clk = init_signal_list();
+    add_pin_to_signal_list(clk, dpram_signals->clk);
+    add_input_port_to_memory(dpram, clk, "clk");
+
+    /* OUTPUT */
+    if (dpram_signals->out1 == NULL) {
+        /* init the signal list: one new driver pin and net per data1 bit */
+        dpram_signals->out1 = init_signal_list();
+        for (int i = 0; i < dpram_signals->data1->count; i++) {
+            npin_t *new_pin = allocate_npin();
+            nnet_t *new_net = allocate_nnet();
+            /* add pin as the net driver */
+            add_driver_pin_to_net(new_net, new_pin);
+            /* store them into dpram signals */
+            add_pin_to_signal_list(dpram_signals->out1, new_pin);
+        }
+    }
+    add_output_port_to_memory(dpram, dpram_signals->out1, "out1");
+    if (dpram_signals->out2 == NULL) {
+        /* init the signal list: one new driver pin and net per data2 bit */
+        dpram_signals->out2 = init_signal_list();
+        for (int i = 0; i < dpram_signals->data2->count; i++) {
+            npin_t *new_pin = allocate_npin();
+            nnet_t *new_net = allocate_nnet();
+            /* add pin as the net driver */
+            add_driver_pin_to_net(new_net, new_pin);
+            /* store them into dpram signals */
+            add_pin_to_signal_list(dpram_signals->out2, new_pin);
+        }
+    }
+    add_output_port_to_memory(dpram, dpram_signals->out2, "out2");
+
+    /* already compatible with DPRAM config so we leave it as is */
+    dp_memory_list = insert_in_vptr_list(dp_memory_list, dpram);
+    /* register the DPRAM in arch model to have the related model in BLIF for simulation */
+    register_memory_model(dpram);
+
+    // CLEAN UP (only the temporary we1/we2/clk lists created in this function)
+    free_signal_list(we1);
+    free_signal_list(we2);
+    free_signal_list(clk);
+
+    return (dpram);
+}
+
+/**
+ * (function: register_memory_model)
+ *
+ * @brief flag the hard block model matching this memory as used,
+ * when find_hard_block returns one for the memory's identifier
+ *
+ * @param mem memory node whose related AST identifier names the hard block
+ */
+void register_memory_model(nnode_t *mem)
+{
+    /* Look up a hard block under the identifier stored in the memory's related AST node */
+    t_model *model = find_hard_block(mem->related_ast_node->identifier_node->types.identifier);
+    /* No matching hard block: nothing to flag */
+    if (model == NULL)
+        return;
+    /* Declare the hard block as used for the blif generation */
+    model->used = 1;
+}
+
+/**
+ * (function: resolve_single_port_ram)
+ *
+ * @brief resolve the spram block by reordering the input signals
+ * to be compatible with Odin's partial mapping phase
+ *
+ * @param node pointing to a spram node (freed before returning)
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file (unused here)
+ */
+void resolve_single_port_ram(nnode_t *node, uintptr_t traverse_mark_number, netlist_t * /* netlist */)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+    oassert(node->num_input_port_sizes == 4);
+    oassert(node->num_output_port_sizes == 1);
+
+    /* check if the node is a valid spram */ //@TODO
+    if (!is_blif_sp_ram(node))
+        error_message(RESOLVE, node->loc, "SPRAM (%s) ports mismatch with VTR single_port_ram hard block ports\n", node->name);
+
+    /**
+     * blif single port ram information
+     *
+     * ADDR:    input port [0]
+     * CLOCK:   input port [1]
+     * DATAIN:  input port [2]
+     * WE:      input port [3]
+     *
+     * DATAOUT: output port [0]
+     */
+
+    int SP_ADDR_width = node->input_port_sizes[0];
+    int SP_CLK_width = node->input_port_sizes[1]; // should be 1
+    int SP_DATA_width = node->input_port_sizes[2];
+    int SP_WE_width = node->input_port_sizes[3]; // should be 1
+    int SP_OUT_width = node->output_port_sizes[0];
+
+    /* validate the port widths */
+    oassert(SP_CLK_width == 1 && SP_WE_width == 1);
+    oassert(SP_DATA_width == node->output_port_sizes[0]);
+
+    /* running index of the current port's first pin in the node's pin arrays */
+    int offset = 0;
+    /* container collecting the signals handed to the new spram */
+    sp_ram_signals *signals = init_sp_ram_signals();
+
+    /* INPUTS */
+    /* adding the addr signals */
+    for (int i = 0; i < SP_ADDR_width; i++) {
+        npin_t *pin = node->input_pins[offset + i];
+        /* detach from the main node, since it will be connected to a new spram */
+        pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->addr, pin);
+    }
+    offset += SP_ADDR_width;
+
+    /* adding the clk signals */
+    npin_t *clk_pin = node->input_pins[offset];
+    /* detach from the main node, since it will be connected to a new spram */
+    clk_pin->node->input_pins[clk_pin->pin_node_idx] = NULL;
+    signals->clk = clk_pin;
+    /* increment offset */
+    offset += 1;
+
+    /* adding the data signals */
+    for (int i = 0; i < SP_DATA_width; i++) {
+        /* hook the data pin to new node */
+        npin_t *pin = node->input_pins[offset + i];
+        /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+        pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->data, pin);
+    }
+    offset += SP_DATA_width;
+
+    /* adding the we signals */
+    npin_t *we_pin = node->input_pins[offset];
+    /* detach from the main node, since it will be connected to a new spram */
+    we_pin->node->input_pins[we_pin->pin_node_idx] = NULL;
+    signals->we = we_pin;
+
+    /* OUTPUT */
+    /* adding the output signals */
+    offset = 0;
+    for (int i = 0; i < SP_OUT_width; i++) {
+        /* hook the output pin to new node */
+        npin_t *pin = node->output_pins[offset + i];
+        /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+        pin->node->output_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->out, pin);
+    }
+
+    /* creating a new spram from the collected signals (return value unused here) */
+    create_single_port_ram(signals, node);
+
+    // CLEAN UP
+    free_nnode(node);
+    free_sp_ram_signals(signals);
+}
+
+/**
+ * (function: resolve_dual_port_ram)
+ *
+ * @brief resolve the dpram block by reordering the input signals
+ * to be compatible with Odin's partial mapping phase
+ *
+ * @param node pointing to a dual port ram node (freed before returning)
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file (used to obtain pad pins for the narrower address port)
+ */
+void resolve_dual_port_ram(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+    oassert(node->num_input_port_sizes == 7);
+    oassert(node->num_output_port_sizes == 2);
+
+    /* check if the node is a valid dpram */ //@TODO
+    if (!is_blif_dp_ram(node))
+        error_message(RESOLVE, node->loc, "DPRAM (%s) ports mismatch with VTR dual_port_ram hard block ports\n", node->name);
+
+    /**
+     * blif dual port ram information
+     *
+     * ADDR1:    input port [0]
+     * ADDR2:    input port [1]
+     * CLOCK:    input port [2]
+     * DATAIN1:  input port [3]
+     * DATAIN2:  input port [4]
+     * WE1:      input port [5]
+     * WE2:      input port [6]
+     *
+     * DATAOUT1: output port [0]
+     * DATAOUT2: output port [1]
+     */
+    int DP_ADDR1_width = node->input_port_sizes[0];
+    int DP_ADDR2_width = node->input_port_sizes[1];
+    int DP_CLK_width = node->input_port_sizes[2]; // should be 1
+    int DP_DATA1_width = node->input_port_sizes[3];
+    int DP_DATA2_width = node->input_port_sizes[4];
+    int DP_WE1_width = node->input_port_sizes[5]; // should be 1
+    int DP_WE2_width = node->input_port_sizes[6]; // should be 1
+    int DP_OUT1_width = node->output_port_sizes[0];
+    int DP_OUT2_width = node->output_port_sizes[1];
+
+    /* validate the port width */
+    oassert((DP_CLK_width == 1) && (DP_WE1_width == 1) && (DP_WE2_width == 1));
+    oassert(DP_DATA1_width == DP_DATA2_width);
+    oassert(DP_DATA1_width == node->output_port_sizes[0]);
+    oassert(DP_DATA1_width == node->output_port_sizes[1]);
+
+    int max_addr_width = std::max(DP_ADDR1_width, DP_ADDR2_width); // both address lists are extended to this width below
+
+    /* running index of the current port's first pin in the node's pin arrays */
+    int offset = 0;
+    /* container collecting the signals handed to the new dpram */
+    dp_ram_signals *signals = init_dp_ram_signals();
+
+    /* INPUTS */
+    /* adding the addr1 signals */
+    for (int i = 0; i < max_addr_width; i++) {
+        /* hook the addr1 pin to new node, or a pad pin once past addr1's own width */
+        if (i < DP_ADDR1_width) {
+            npin_t *pin = node->input_pins[offset + i];
+            /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+            pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+            add_pin_to_signal_list(signals->addr1, pin);
+        } else {
+            add_pin_to_signal_list(signals->addr1, get_pad_pin(netlist));
+        }
+    }
+    offset += DP_ADDR1_width;
+
+    /* adding the addr2 signals */
+    for (int i = 0; i < max_addr_width; i++) {
+        /* hook the addr2 pin to new node, or a pad pin once past addr2's own width */
+        if (i < DP_ADDR2_width) {
+            npin_t *pin = node->input_pins[offset + i];
+            /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+            pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+            add_pin_to_signal_list(signals->addr2, pin);
+        } else {
+            add_pin_to_signal_list(signals->addr2, get_pad_pin(netlist));
+        }
+    }
+    offset += DP_ADDR2_width;
+
+    /* adding the clk signals */
+    npin_t *clk_pin = node->input_pins[offset];
+    /* detach from the main node, since it will be connected to a new dpram */
+    clk_pin->node->input_pins[clk_pin->pin_node_idx] = NULL;
+    signals->clk = clk_pin;
+    /* increment offset */
+    offset += 1;
+
+    /* adding the data1 signals */
+    for (int i = 0; i < DP_DATA1_width; i++) {
+        /* hook the data1 pin to new node */
+        npin_t *pin = node->input_pins[offset + i];
+        /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+        pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->data1, pin);
+    }
+    offset += DP_DATA1_width;
+
+    /* adding the data2 signals */
+    for (int i = 0; i < DP_DATA2_width; i++) {
+        /* hook the data2 pin to new node */
+        npin_t *pin = node->input_pins[offset + i];
+        /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+        pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->data2, pin);
+    }
+    offset += DP_DATA2_width;
+
+    /* adding the we1 signals */
+    npin_t *we1_pin = node->input_pins[offset];
+    /* detach from the main node, since it will be connected to a new dpram */
+    we1_pin->node->input_pins[we1_pin->pin_node_idx] = NULL;
+    signals->we1 = we1_pin;
+    /* increment offset */
+    offset += 1;
+
+    /* adding the we2 signals */
+    npin_t *we2_pin = node->input_pins[offset];
+    /* detach from the main node, since it will be connected to a new dpram */
+    we2_pin->node->input_pins[we2_pin->pin_node_idx] = NULL;
+    signals->we2 = we2_pin;
+
+    /* OUTPUT */
+    offset = 0;
+    /* adding the output1 signals */
+    for (int i = 0; i < DP_OUT1_width; i++) {
+        /* hook the out1 pin to new node */
+        npin_t *pin = node->output_pins[offset + i];
+        /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+        pin->node->output_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->out1, pin);
+    }
+    offset += DP_OUT1_width;
+
+    /* adding the output2 signals */
+    for (int i = 0; i < DP_OUT2_width; i++) {
+        /* hook the out2 pin to new node */
+        npin_t *pin = node->output_pins[offset + i];
+        /* in case of padding, pins have not been remapped, need to detach them from the BRAM node */
+        pin->node->output_pins[pin->pin_node_idx] = NULL;
+
+        add_pin_to_signal_list(signals->out2, pin);
+    }
+
+    /* creating a new dpram from the collected (address-padded) signals; return value unused here */
+    create_dual_port_ram(signals, node);
+
+    // CLEAN UP
+    free_nnode(node);
+    free_dp_ram_signals(signals);
+}
diff --git a/parmys-plugin/core/memory.h b/parmys-plugin/core/memory.h
new file mode 100644
index 000000000..3b39096f5
--- /dev/null
+++ b/parmys-plugin/core/memory.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _MEMORY_H_
+#define _MEMORY_H_
+
+#include "odin_types.h"
+
+extern vtr::t_linked_vptr *sp_memory_list;
+extern vtr::t_linked_vptr *dp_memory_list;
+extern t_model *single_port_rams;
+extern t_model *dual_port_rams;
+
+#define HARD_RAM_ADDR_LIMIT 33
+#define SOFT_RAM_ADDR_LIMIT 10
+
+struct sp_ram_signals { // signal bundle for one single port ram, as consumed by create_single_port_ram
+    signal_list_t *addr; // address pins, hooked to the "addr" input port
+    signal_list_t *data; // data input pins, hooked to the "data" input port
+    signal_list_t *out;  // output pins, hooked to the "out" output port (same count as data)
+    npin_t *we;          // single enable pin, hooked to the "we" input port
+    npin_t *clk;         // single clock pin, hooked to the "clk" input port
+};
+
+struct dp_ram_signals { // signal bundle for one dual port ram, as consumed by create_dual_port_ram
+    signal_list_t *addr1; // port 1 address pins, hooked to the "addr1" input port
+    signal_list_t *addr2; // port 2 address pins, hooked to the "addr2" input port (same count as addr1)
+    signal_list_t *data1; // port 1 data input pins, hooked to the "data1" input port
+    signal_list_t *data2; // port 2 data input pins, hooked to the "data2" input port (same count as data1)
+    signal_list_t *out1;  // port 1 output pins, hooked to the "out1" output port
+    signal_list_t *out2;  // port 2 output pins, hooked to the "out2" output port
+    npin_t *we1;          // single enable pin for port 1 ("we1" input port)
+    npin_t *we2;          // single enable pin for port 2 ("we2" input port)
+    npin_t *clk;          // single clock pin, hooked to the "clk" input port
+};
+
+extern sp_ram_signals *init_sp_ram_signals();
+extern dp_ram_signals *init_dp_ram_signals();
+
+long get_sp_ram_split_depth();
+long get_dp_ram_split_depth();
+
+sp_ram_signals *get_sp_ram_signals(nnode_t *node);
+void free_sp_ram_signals(sp_ram_signals *signalsvar);
+
+dp_ram_signals *get_dp_ram_signals(nnode_t *node);
+void free_dp_ram_signals(dp_ram_signals *signalsvar);
+
+bool is_sp_ram(nnode_t *node);
+bool is_dp_ram(nnode_t *node);
+
+bool is_blif_sp_ram(nnode_t *node);
+bool is_blif_dp_ram(nnode_t *node);
+
+void check_memories_and_report_distribution();
+
+long get_sp_ram_depth(nnode_t *node);
+long get_dp_ram_depth(nnode_t *node);
+long get_sp_ram_width(nnode_t *node);
+long get_dp_ram_width(nnode_t *node);
+
+void split_sp_memory_depth(nnode_t *node, int split_size);
+void split_dp_memory_depth(nnode_t *node, int split_size);
+void split_sp_memory_width(nnode_t *node, int target_size);
+void split_dp_memory_width(nnode_t *node, int target_size);
+void iterate_memories(netlist_t *netlist);
+void free_memory_lists();
+
+void instantiate_soft_single_port_ram(nnode_t *node, short mark, netlist_t *netlist);
+void instantiate_soft_dual_port_ram(nnode_t *node, short mark, netlist_t *netlist);
+
+signal_list_t *create_decoder(nnode_t *node, short mark, signal_list_t *input_list, netlist_t *netlist);
+
+extern void add_input_port_to_memory(nnode_t *node, signal_list_t *signalsvar, const char *port_name);
+extern void add_output_port_to_memory(nnode_t *node, signal_list_t *signalsvar, const char *port_name);
+extern void remap_input_port_to_memory(nnode_t *node, signal_list_t *signalsvar, const char *port_name);
+
+extern nnode_t *create_single_port_ram(sp_ram_signals *spram_signals, nnode_t *node);
+extern nnode_t *create_dual_port_ram(dp_ram_signals *dpram_signals, nnode_t *node);
+
+extern void register_memory_model(nnode_t *mem);
+
+extern void resolve_single_port_ram(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+extern void resolve_dual_port_ram(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+#endif // _MEMORY_H_
diff --git a/parmys-plugin/core/multiplier.cc b/parmys-plugin/core/multiplier.cc
new file mode 100644
index 000000000..befb0b337
--- /dev/null
+++ b/parmys-plugin/core/multiplier.cc
@@ -0,0 +1,1891 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "multiplier.h"
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+#include "partial_map.h"
+#include "read_xml_arch_file.h"
+#include <algorithm>
+#include <cmath>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+
+#include "adder.h"
+
+#include "vtr_list.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+#include "parmys_utils.h"
+
+using vtr::insert_in_vptr_list;
+using vtr::t_linked_vptr;
+
+/* Architecture model named "multiply", located by find_hard_multipliers(); stays NULL when Arch.models has no such model. */
+t_model *hard_multipliers = NULL;
+/* Linked list of multiplier-related entries. NOTE(review): it is populated outside the code visible here - confirm what it stores. */
+t_linked_vptr *mult_list = NULL;
+/* Copy of configuration.min_hard_multiplier, refreshed by find_hard_multipliers(). */
+int min_mult = 0;
+/* Size histogram allocated by init_mult_distribution(), filled by record_mult_distribution() and printed by report_mult_distribution(). */
+int *mults = NULL;
+
+/* Forward declarations of helpers used by this translation unit. */
+void record_mult_distribution(nnode_t *node);
+void terminate_mult_distribution();
+void init_split_multiplier(nnode_t *node, nnode_t *ptr, int offa, int a, int offb, int b, nnode_t *node_a, nnode_t *node_b);
+void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b);
+void split_multiplier_a(nnode_t *node, int a0, int a1, int b);
+void split_multiplier_b(nnode_t *node, int a, int b1, int b0);
+void pad_multiplier(nnode_t *node, netlist_t *netlist);
+void split_soft_multiplier(nnode_t *node, netlist_t *netlist);
+/* File-local (static) helpers for the constant-multiplication path. */
+static mult_port_stat_e is_constant_multipication(nnode_t *node, netlist_t *netlist);
+static signal_list_t *implement_constant_multipication(nnode_t *node, mult_port_stat_e port_status, short mark, netlist_t *netlist);
+static nnode_t *perform_const_mult_optimization(mult_port_stat_e mult_port_stat, nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+static void cleanup_mult_old_node(nnode_t *nodeo, netlist_t *netlist);
+
+// Data structure representing one row of bits in an adder tree.
+struct AdderTreeRow {
+    // the shift of this row from the least significant bit of the multiplier output
+    size_t shift;
+    // array representing the bits in the row; each entry pairs a node
+    // pointer with the index of this bit in that node's output array.
+    std::vector<std::pair<nnode_t *, int>> bits;
+};
+
+/*---------------------------------------------------------------------------
+ * (function: instantiate_simple_soft_multiplier )
+ *
+ * Replaces a multiplier node by soft logic: one 2-input LOGICAL_AND gate per
+ * partial-product bit plus a chain of ADD nodes, one for every bit of input
+ * port 1 (the multiplicand) except the first. Input port 0 is the multiplier.
+ *
+ * Sample 4x4 multiplier to help understand logic.
+ *
+ *                     a3   a2   a1   a0
+ *                     b3   b2   b1   b0
+ *                     ------------------
+ *                     c03  c02  c01  c00
+ *           +    c13  c12  c11  c10
+ *           ----------------------------
+ *           r14  r13  r12  r11  r10
+ *      +    c23  c22  c21  c20
+ *      ----------------------------
+ *      r24  r23  r22  r21  r20
+ * +    c33  c32  c31  c30
+ * ------------------------------------------
+ * o7   o6   o5   o4   o3   o2   o1   o0
+ *
+ * The first adder row adds c0[j+1] (c0 without its LSB) to c1[j]; every later
+ * row adds the previous row's outputs [j+1] to the next partial product.
+ * Output 0 is c00, the following outputs are output 0 of successive adders,
+ * and the remaining ones are taken from the last adder. Output pins beyond
+ * what the last adder provides are driven by a BUF_NODE fed with the pad pin.
+ *
+ * The input and output pins of `node` are remapped onto the new gates. The
+ * temporary ADD nodes are lowered with instantiate_add_w_carry() and freed.
+ *
+ * @param node    multiplier node (2 input ports, 1 output port)
+ * @param mark    traversal mark handed to every gate created here
+ * @param netlist netlist supplying the zero and pad pins
+ *-------------------------------------------------------------------------*/
+void instantiate_simple_soft_multiplier(nnode_t *node, short mark, netlist_t *netlist)
+{
+    /* need for an carry-ripple-adder for each of the bits of port B. */
+    /* good question of which is better to put on the bottom of multiplier.  Larger means more smaller adds, or small is
+     * less large adds */
+    oassert(node->num_output_pins > 0);
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->num_output_port_sizes == 1);
+    const int width_a = node->input_port_sizes[0];
+    const int width_b = node->input_port_sizes[1];
+    const int width = node->output_port_sizes[0];
+    const int multiplicand_width = width_b;
+    const int multiplier_width = width_a;
+    /* offset is related to which multport is chosen as the multiplicand */
+    const int multiplicand_offset_index = width_a;
+    const int multiplier_offset_index = 0;
+
+    /* one adder per multiplicand bit except the first; the parentheses make the
+     * "- 1" apply to the element count instead of the byte count */
+    nnode_t **adders_for_partial_products = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * (multiplicand_width - 1));
+
+    /* need to generate partial products for each bit in width B. */
+    nnode_t ***partial_products = (nnode_t ***)vtr::malloc(sizeof(nnode_t **) * multiplicand_width);
+
+    /* generate the AND partial products */
+    for (int i = 0; i < multiplicand_width; i++) {
+        /* create the memory for each AND gate needed for the levels of partial products */
+        partial_products[i] = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * multiplier_width);
+
+        if (i < multiplicand_width - 1) {
+            adders_for_partial_products[i] = make_2port_gate(ADD, multiplier_width + 1, multiplier_width + 1, multiplier_width + 1, node, mark);
+        }
+
+        for (int j = 0; j < multiplier_width; j++) {
+            /* create each one of the partial products */
+            partial_products[i][j] = make_1port_logic_gate(LOGICAL_AND, 2, node, mark);
+        }
+    }
+
+    /* generate the connections to the AND gates */
+    for (int i = 0; i < multiplicand_width; i++) {
+        for (int j = 0; j < multiplier_width; j++) {
+            /* hookup the input of B to each AND gate */
+            if (j == 0) {
+                /* IF - this is the first time we are mapping multiplicand port then can remap */
+                remap_pin_to_new_node(node->input_pins[i + multiplicand_offset_index], partial_products[i][j], 0);
+            } else {
+                /* ELSE - this needs to be a new output of the multiplicand port */
+                add_input_pin_to_node(partial_products[i][j], copy_input_npin(partial_products[i][0]->input_pins[0]), 0);
+            }
+
+            /* hookup the input of the multiplier to each AND gate */
+            if (i == 0) {
+                /* IF - this is the first time we are mapping multiplier port then can remap */
+                remap_pin_to_new_node(node->input_pins[j + multiplier_offset_index], partial_products[i][j], 1);
+            } else {
+                /* ELSE - this needs to be a new output of the multiplier port */
+                add_input_pin_to_node(partial_products[i][j], copy_input_npin(partial_products[0][j]->input_pins[1]), 1);
+            }
+        }
+    }
+
+    /* hookup each of the adders */
+    for (int i = 0; i < multiplicand_width - 1; i++) // -1 since the first stage is a combo of partial products while all others are part of tree
+    {
+        for (int j = 0; j < multiplier_width + 1; j++) // +1 since adders are one greater than multwidth to pass carry
+        {
+            /* join to port 1 of the add one of the partial products.  */
+            if (i == 0) {
+                /* IF - this is the first addition row, then adding two sets of partial products and first set is from the c0* */
+                if (j < multiplier_width - 1) {
+                    /* IF - we just take an element of the first list c[0][j+1]. */
+                    connect_nodes(partial_products[i][j + 1], 0, adders_for_partial_products[i], j);
+                } else {
+                    /* ELSE - this is the last input to the first adder, then we pass in 0 since no carry yet */
+                    add_input_pin_to_node(adders_for_partial_products[i], get_zero_pin(netlist), j);
+                }
+            } else if (j < multiplier_width) {
+                /* ELSE - this is the standard situation when we need to hookup this adder with a previous adder, r[i-1][j+1] */
+                connect_nodes(adders_for_partial_products[i - 1], j + 1, adders_for_partial_products[i], j);
+            } else {
+                /* the top bit of the first operand has no source in the previous row: pad with zero */
+                add_input_pin_to_node(adders_for_partial_products[i], get_zero_pin(netlist), j);
+            }
+
+            if (j < multiplier_width) {
+                /* IF - this is not most significant bit then just add current partial product */
+                connect_nodes(partial_products[i + 1][j], 0, adders_for_partial_products[i], j + multiplier_width + 1);
+            } else {
+                /* the partial product is one bit narrower than the adder: pad its top bit with zero */
+                add_input_pin_to_node(adders_for_partial_products[i], get_zero_pin(netlist), j + multiplier_width + 1);
+            }
+        }
+    }
+
+    /* index of the next output of the last adder to hand out */
+    int current_index = 0;
+    /* hookup the outputs */
+    for (int i = 0; i < width; i++) {
+        if (multiplicand_width == 1) {
+            // this is undealt with
+            error_message(RESOLVE, node->loc, "%s", "Cannot create soft multiplier with multiplicand width of 1.\n");
+        } else if (i == 0) {
+            /* IF - this is the LSbit, then we use a pass through from the partial product */
+            remap_pin_to_new_node(node->output_pins[i], partial_products[0][0], 0);
+        } else if (i < multiplicand_width - 1) {
+            /* ELSE IF - these are the middle values that come from the LSbit of partial adders */
+            remap_pin_to_new_node(node->output_pins[i], adders_for_partial_products[i - 1], 0);
+        } else {
+            if (current_index > multiplier_width) {
+                /* output pins greater than 2X multiplier width will be driven with pad node */
+                nnode_t *buf_node = make_1port_gate(BUF_NODE, 1, 1, node, mark);
+                /* hook a pad pin into buf node */
+                add_input_pin_to_node(buf_node, get_pad_pin(netlist), 0);
+                /* hook the over size output pin to into the buf node */
+                remap_pin_to_new_node(node->output_pins[i], buf_node, 0);
+            } else {
+                /* ELSE - the final outputs are straight from the outputs of the last adder */
+                remap_pin_to_new_node(node->output_pins[i], adders_for_partial_products[multiplicand_width - 2], current_index);
+            }
+            current_index++;
+        }
+    }
+
+    /* soft map the adders if they need to be mapped */
+    for (int i = 0; i < multiplicand_width - 1; i++) {
+        instantiate_add_w_carry(adders_for_partial_products[i], mark, netlist);
+    }
+
+    /* Cleanup everything */
+    if (adders_for_partial_products != NULL) {
+        for (int i = 0; i < multiplicand_width - 1; i++) {
+            free_nnode(adders_for_partial_products[i]);
+        }
+        vtr::free(adders_for_partial_products);
+    }
+    /* release the per-row gate tables and then the table of rows; the AND gates
+     * themselves are not freed here. The outer pointer is checked before it is
+     * indexed. */
+    if (partial_products != NULL) {
+        for (int i = 0; i < multiplicand_width; i++) {
+            if (partial_products[i] != NULL) {
+                vtr::free(partial_products[i]);
+            }
+        }
+        vtr::free(partial_products);
+    }
+}
+
+/**
+ * --------------------------------------------------------------------------
+ * (function: implement_constant_multipication)
+ *
+ * @brief implementing constant multiplication utilizing shift and ADD operations
+ *
+ * For every bit i of the constant operand a stage result is kept in
+ * internal_outputs[i]:
+ *   - bit tied to GND: the previous stage is passed through (zero pins for i == 0);
+ *   - bit tied to VCC, i == 0: the variable operand, zero-padded to the output width;
+ *   - bit tied to VCC, i > 0: previous stage + (variable operand << i), built
+ *     from a new SL node and a new ADD node.
+ * The pins of the last computed stage are returned.
+ *
+ * @note this function should call before partial mapping phase
+ * since some logic need to be softened
+ *
+ * @param node pointer to the multiplication netlist node
+ * @param port_status showing which value is constant, which is variable
+ * @param mark a unique DFS traversal number
+ * @param netlist pointer to the current netlist
+ *
+ * @return output signal (newly allocated signal list)
+ * -------------------------------------------------------------------------*/
+static signal_list_t *implement_constant_multipication(nnode_t *node, mult_port_stat_e port_status, short mark, netlist_t *netlist)
+{
+    /* validate the port sizes */
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->num_output_port_sizes == 1);
+
+    signal_list_t *return_value = init_signal_list();
+
+    /**
+     * Multiply ports
+     * IN1: (n bits)        input_port[0]
+     * IN2: (m bits)        input_port[1]
+     * OUT: min(m, n) bits  output_port[0]
+     */
+
+    int IN1_width = node->input_port_sizes[0];
+
+    /* MULTIPICAND_CONSTANT selects input port 1 as the constant operand, otherwise port 0 is the constant */
+    int const_operand_offset = (port_status == mult_port_stat_e::MULTIPICAND_CONSTANT) ? IN1_width : 0;
+    int const_operand_width = node->input_port_sizes[(port_status == mult_port_stat_e::MULTIPICAND_CONSTANT) ? 1 : 0];
+
+    int variable_operand_offset = (port_status == mult_port_stat_e::MULTIPICAND_CONSTANT) ? 0 : IN1_width;
+    int variable_operand_width = node->num_input_pins - const_operand_width;
+    operation_list variable_operand_signedness =
+      (port_status == mult_port_stat_e::MULTIPICAND_CONSTANT) ? node->attributes->port_a_signed : node->attributes->port_b_signed;
+
+    /* every stage (shift and add nodes) operates on the full output width */
+    int width = node->num_output_pins;
+
+    /* container for constant operand */
+    signal_list_t *const_operand = init_signal_list();
+    for (int i = 0; i < const_operand_width; i++) {
+        add_pin_to_signal_list(const_operand, node->input_pins[const_operand_offset + i]);
+    }
+    /* container for variable operand */
+    signal_list_t *variable_operand = init_signal_list();
+    for (int i = 0; i < variable_operand_width; i++) {
+        add_pin_to_signal_list(variable_operand, node->input_pins[variable_operand_offset + i]);
+    }
+
+    /* netlist GND and VCC net */
+    nnet_t *gnd_net = netlist->zero_net;
+    nnet_t *vcc_net = netlist->one_net;
+
+    int internal_outputs_size = const_operand_width;
+    /* to keep the record of internal outputs for connection purposes */
+    signal_list_t **internal_outputs = (signal_list_t **)vtr::calloc(internal_outputs_size, sizeof(signal_list_t *));
+    /* implementing the multiplication using shift and add operation */
+    /* the loop runs one index past the last stage so that the terminating branch below can collect the result */
+    for (int i = 0; i < node->num_output_pins + 1; i++) {
+        npin_t *pin;
+        /* checking a couple conditions to avoid going further if there is not needed */
+        if (i == node->num_output_pins || i == const_operand_width) {
+            /* remember how many stages were actually created, for the clean-up loop */
+            internal_outputs_size = i;
+            /* initializing the return value */
+            for (int j = 0; j < internal_outputs[i - 1]->count; j++) {
+                add_pin_to_signal_list(return_value, internal_outputs[i - 1]->pins[j]);
+            }
+            break;
+        } else {
+            pin = const_operand->pins[i];
+        }
+        /* init the internal outputs signal list */
+        internal_outputs[i] = init_signal_list();
+
+        /* NOTE(review): a constant pin whose net is neither GND nor VCC leaves this stage's list empty,
+         * while later stages index it up to `width`; callers presumably guarantee constant bits - confirm */
+        /* if the pin is GND we pass */
+        if (!strcmp(pin->net->name, gnd_net->name)) {
+            for (int j = 0; j < width; j++) {
+                /* if the first bit of const_operand is zero we need to initiate the multiplication by zero pins */
+                npin_t *internal_output_pin = (i == 0) ? get_zero_pin(netlist) : internal_outputs[i - 1]->pins[j];
+                add_pin_to_signal_list(internal_outputs[i], internal_output_pin);
+            }
+        }
+        /* the const_operand pin is connected to VCC */
+        else if (!strcmp(pin->net->name, vcc_net->name)) {
+            /* for the first round we do not need to shift */
+            if (i == 0) {
+                for (int j = 0; j < width; j++) {
+                    if (j < variable_operand_width) {
+                        add_pin_to_signal_list(internal_outputs[0], copy_input_npin(variable_operand->pins[j]));
+                    } else {
+                        /* the first stage is always padded with zero pins, regardless of signedness */
+                        add_pin_to_signal_list(internal_outputs[0], get_zero_pin(netlist));
+                    }
+                }
+            } else {
+                /*****************************************************************************************/
+                /*************************************** SHIFT_NODE **************************************/
+                /*****************************************************************************************/
+                /**
+                 * create a shift node to shift the variable port based on the i idx
+                 *
+                 * (shift node)
+                 * IN1: variable_operand of the multiplier
+                 * IN2: shift value (const_operand_width maximum size)
+                 * OUT: shifted IN1 (width)
+                 *
+                 */
+                nnode_t *shift_node = make_2port_gate(SL, width, width, width, node, mark);
+                /* connecting the shift value pins */
+                signal_list_t *shift_value = create_constant_signal(i, width, netlist);
+
+                /* keeping the shift output nodes for adding with the previous stage internal outputs */
+                signal_list_t *shift_outputs = init_signal_list();
+
+                /* index of the variable operand's most significant pin, replicated when the operand is SIGNED */
+                int pad_pin = variable_operand->count - 1;
+                for (int j = 0; j < width; j++) {
+                    if (j < variable_operand_width) {
+                        /* connecting the first input of the shift node */
+                        add_input_pin_to_node(shift_node, copy_input_npin(variable_operand->pins[j]), j);
+                    } else {
+                        /* extend the operand to the full width: copy of the top pin if SIGNED, zero pin otherwise */
+                        add_input_pin_to_node(
+                          shift_node,
+                          (variable_operand_signedness == SIGNED) ? copy_input_npin(variable_operand->pins[pad_pin]) : get_zero_pin(netlist), j);
+                    }
+
+                    /* hook shift value pins into the shift node */
+                    add_input_pin_to_node(shift_node, shift_value->pins[j], width + j);
+
+                    /* Specifying the shift node outputs */
+                    // Connect output pin to related input pin
+                    npin_t *var_op_out1 = allocate_npin();
+                    npin_t *var_op_out2 = allocate_npin();
+                    nnet_t *var_op_net = allocate_nnet();
+                    var_op_net->name = make_full_ref_name(NULL, NULL, NULL, shift_node->name, j);
+                    /* hook the output pin into the node */
+                    add_output_pin_to_node(shift_node, var_op_out1, j);
+                    /* hook up new pin 1 into the new net */
+                    add_driver_pin_to_net(var_op_net, var_op_out1);
+                    /* hook up the new pin 2 to this new net */
+                    add_fanout_pin_to_net(var_op_net, var_op_out2);
+
+                    /* adding the output pin to the shift output signal container */
+                    add_pin_to_signal_list(shift_outputs, var_op_out2);
+                }
+
+                /*****************************************************************************************/
+                /**************************************** ADD_NODE ***************************************/
+                /*****************************************************************************************/
+                nnode_t *add_node = make_2port_gate(ADD, width, width, width, node, mark);
+                /* record the new adder in add_list (declared outside this function) */
+                add_list = insert_in_vptr_list(add_list, add_node);
+                /* connecting add node input pins */
+                for (int j = 0; j < width; j++) {
+                    /* connecting the previous stage internal outputs as the first add inputs */
+                    add_input_pin_to_node(add_node, internal_outputs[i - 1]->pins[j], j);
+
+                    /* connecting the shift output pins as the second input */
+                    add_input_pin_to_node(add_node, shift_outputs->pins[j], width + j);
+
+                    /* creating new output pins and adding to the internal outputs for next stages */
+                    // Connect output pin to related input pin
+                    npin_t *add_op_out1 = allocate_npin();
+                    npin_t *add_op_out2 = allocate_npin();
+                    nnet_t *add_op_net = allocate_nnet();
+                    add_op_net->name = make_full_ref_name(NULL, NULL, NULL, add_node->name, j);
+                    /* hook the output pin into the node */
+                    add_output_pin_to_node(add_node, add_op_out1, j);
+                    /* hook up new pin 1 into the new net */
+                    add_driver_pin_to_net(add_op_net, add_op_out1);
+                    /* hook up the new pin 2 to this new net */
+                    add_fanout_pin_to_net(add_op_net, add_op_out2);
+
+                    /* adding the output pin to this stage's internal outputs */
+                    add_pin_to_signal_list(internal_outputs[i], add_op_out2);
+                }
+
+                // CLEAN UP
+                free_signal_list(shift_value);
+                free_signal_list(shift_outputs);
+            }
+        }
+    }
+
+    // CLEAN UP
+    free_signal_list(const_operand);
+    free_signal_list(variable_operand);
+
+    /* only the stages that were created (see internal_outputs_size above) are released */
+    for (int i = 0; i < internal_outputs_size; i++) {
+        if (internal_outputs[i])
+            free_signal_list(internal_outputs[i]);
+    }
+    vtr::free(internal_outputs);
+
+    return (return_value);
+}
+
+/**
+ * --------------------------------------------------------------------------
+ * (function: connect_constant_mult_outputs)
+ *
+ * @brief hooks the pins computed for a constant multiplication up to the
+ * outputs of the original mult node, then disposes of that node
+ *
+ * Every computed pin feeds a fresh BUF_NODE and the matching mult output pin
+ * is remapped onto that buffer. Afterwards the signal list is freed, each
+ * input pin of the mult node is detached from its net and freed, and the
+ * node itself is freed.
+ *
+ * @param node pointer to the multiplication netlist node (freed on return)
+ * @param output_signal_list computed output pins, one per node output pin
+ * (freed on return)
+ * -------------------------------------------------------------------------*/
+void connect_constant_mult_outputs(nnode_t *node, signal_list_t *output_signal_list)
+{
+    /* the computed signals must match the mult output width one to one */
+    int output_width = node->num_output_pins;
+    oassert(output_width == output_signal_list->count);
+
+    /* route each computed pin to its mult output through a dedicated buffer */
+    for (int idx = 0; idx < output_signal_list->count; ++idx) {
+        nnode_t *buffer = make_1port_gate(BUF_NODE, 1, 1, node, node->traverse_visited);
+        /* the computed pin drives the buffer ... */
+        add_input_pin_to_node(buffer, output_signal_list->pins[idx], 0);
+        /* ... and the buffer takes over the mult output pin */
+        remap_pin_to_new_node(node->output_pins[idx], buffer, 0);
+    }
+    free_signal_list(output_signal_list);
+
+    /* unhook every input pin from its net and release it */
+    for (int idx = 0; idx < node->num_input_pins; ++idx) {
+        npin_t *in_pin = node->input_pins[idx];
+        remove_fanout_pins_from_net(in_pin->net, in_pin, in_pin->pin_net_idx);
+        free_npin(in_pin);
+        node->input_pins[idx] = NULL;
+    }
+
+    free_nnode(node);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: init_mult_distribution)
+ *
+ * Allocates the zero-initialised histogram `mults` as a row-major table with
+ * (1 + inputs->size) rows and (1 + inputs->next->size) columns, where `inputs`
+ * is the input port list of the hard multiplier model. Cell (a, b) lives at
+ * index a * (1 + inputs->next->size) + b.
+ *-------------------------------------------------------------------------*/
+void init_mult_distribution()
+{
+    oassert(hard_multipliers != NULL);
+    /* the table needs both input ports of the model */
+    oassert(hard_multipliers->inputs != NULL && hard_multipliers->inputs->next != NULL);
+    int len = (1 + hard_multipliers->inputs->size) * (1 + hard_multipliers->inputs->next->size);
+    mults = (int *)vtr::calloc(len, sizeof(int));
+}
+
+/*---------------------------------------------------------------------------
+ * (function: record_mult_distribution)
+ *
+ * Counts one multiplier whose input port sizes are (port 0, port 1) of `node`.
+ * Sizes that fall outside the table allocated by init_mult_distribution() are
+ * ignored: report_mult_distribution() never visits them and storing them
+ * would write outside the allocation.
+ * NOTE(review): any reader of `mults` outside this group of functions must
+ * use the same row stride (1 + inputs->next->size).
+ *-------------------------------------------------------------------------*/
+void record_mult_distribution(nnode_t *node)
+{
+    oassert(hard_multipliers != NULL);
+    oassert(node != NULL);
+
+    const int a = node->input_port_sizes[0];
+    const int b = node->input_port_sizes[1];
+    const int rows = 1 + hard_multipliers->inputs->size;
+    const int cols = 1 + hard_multipliers->inputs->next->size;
+
+    /* stay inside the rows x cols table */
+    if (a < 0 || a >= rows || b < 0 || b >= cols)
+        return;
+
+    /* the row stride is the number of columns, so distinct (a, b) never share a cell */
+    mults[a * cols + b] += 1;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: report_mult_distribution)
+ *
+ * Logs every non-zero histogram cell as "<a> X <b> => <count>" followed by
+ * the total. Does nothing when no hard multiplier model was found. `mults`
+ * is not released here.
+ *-------------------------------------------------------------------------*/
+void report_mult_distribution()
+{
+    long num_total = 0;
+
+    if (hard_multipliers == NULL)
+        return;
+
+    const long max_a = hard_multipliers->inputs->size;
+    const long max_b = hard_multipliers->inputs->next->size;
+    /* same row stride as in record_mult_distribution() */
+    const long cols = 1 + max_b;
+
+    log("\nHard Multiplier Distribution\n");
+    log("============================\n");
+    for (long i = 0; i <= max_a; i++) {
+        for (long j = 1; j <= max_b; j++) {
+            const int count = mults[i * cols + j];
+            if (count != 0) {
+                num_total += count;
+                log("%ld X %ld => %d\n", i, j, count);
+            }
+        }
+    }
+    log("\n");
+    log("\nTotal # of multipliers = %ld\n", num_total);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: find_hard_multipliers)
+ *
+ * Walks Arch.models looking for the model named "multiply". On a match
+ * `hard_multipliers` is left pointing at it and init_mult_distribution() is
+ * called; without a match `hard_multipliers` ends up NULL. `min_mult` is
+ * loaded from configuration.min_hard_multiplier in both cases.
+ *-------------------------------------------------------------------------*/
+void find_hard_multipliers()
+{
+    min_mult = configuration.min_hard_multiplier;
+
+    /* the global itself is the cursor, because init_mult_distribution() reads it */
+    for (hard_multipliers = Arch.models; hard_multipliers != NULL; hard_multipliers = hard_multipliers->next) {
+        if (strcmp(hard_multipliers->name, "multiply") != 0)
+            continue;
+
+        init_mult_distribution();
+        break;
+    }
+}
+
+/*---------------------------------------------------------------------------
+ * (function: declare_hard_multiplier)
+ *
+ * Makes sure the hard multiplier model carries an instance record for the
+ * size of `node`: (larger input width, smaller input width, output width).
+ * An existing record with the same three sizes is reused; otherwise a new
+ * t_multiplier is allocated and pushed at the head of
+ * hard_multipliers->instances.
+ *
+ * When no hard multiplier model exists a warning is emitted and nothing is
+ * registered, because there is no model to attach the instance to.
+ *
+ * @param node multiplier node whose port sizes are registered
+ *-------------------------------------------------------------------------*/
+void declare_hard_multiplier(nnode_t *node)
+{
+    /* See if this size instance of multiplier exists? */
+    if (hard_multipliers == NULL) {
+        warning_message(NETLIST, node->loc, "%s\n", "Instantiating Mulitpliers where hard multipliers do not exist");
+        /* bail out instead of dereferencing the missing model below */
+        return;
+    }
+
+    int width_a = node->input_port_sizes[0];
+    int width_b = node->input_port_sizes[1];
+    const int width = node->output_port_sizes[0];
+    if (width_a < width_b) /* Make sure a is bigger than b */
+    {
+        const int swap = width_b;
+        width_b = width_a;
+        width_a = swap;
+    }
+
+    /* nothing to do when an instance of this exact size is already declared */
+    for (t_multiplier *inst = (t_multiplier *)hard_multipliers->instances; inst != NULL; inst = inst->next) {
+        if ((inst->size_a == width_a) && (inst->size_b == width_b) && (inst->size_out == width))
+            return;
+    }
+
+    /* Does not exist - must create an instance */
+    t_multiplier *fresh = (t_multiplier *)vtr::malloc(sizeof(t_multiplier));
+    fresh->next = (t_multiplier *)hard_multipliers->instances;
+    hard_multipliers->instances = fresh;
+    fresh->size_a = width_a;
+    fresh->size_b = width_b;
+    fresh->size_out = width;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: instantiate_hard_multiplier )
+ *
+ * Declares the multiplier size via declare_hard_multiplier() and renames the
+ * node. With output pins present each pin is renamed "<old name>[<pin index>]"
+ * and the node takes a copy of the last pin's name. Without output pins the
+ * node is named "<old name>_<wide input>_<narrow input>_<output width>".
+ * Finally the node is stamped with `mark`. The netlist argument is unused.
+ *-------------------------------------------------------------------------*/
+void instantiate_hard_multiplier(nnode_t *node, short mark, netlist_t * /*netlist*/)
+{
+    oassert(node && "node is NULL to instanciate hard multiplier");
+
+    declare_hard_multiplier(node);
+
+    /* keep a copy of the current name (empty if none) and drop the old buffer */
+    std::string base_name;
+    if (node->name) {
+        base_name = node->name;
+        vtr::free(node->name);
+        node->name = NULL;
+    }
+
+    if (node->num_output_pins > 0) {
+        /* Give names to the output pins */
+        for (int idx = 0; idx < node->num_output_pins; ++idx) {
+            npin_t *out_pin = node->output_pins[idx];
+            if (out_pin->name) {
+                vtr::free(out_pin->name);
+            }
+            const std::string pin_name = base_name + "[" + std::to_string(out_pin->pin_node_idx) + "]";
+            out_pin->name = vtr::strdup(pin_name.c_str());
+        }
+        /* the node shares the name of its last output pin */
+        node->name = vtr::strdup(node->output_pins[node->num_output_pins - 1]->name);
+    } else {
+        /* wide input first :) */
+        const bool port1_is_wider = node->input_port_sizes[1] > node->input_port_sizes[0];
+        const int wide_port = port1_is_wider ? 1 : 0;
+        const int narrow_port = port1_is_wider ? 0 : 1;
+
+        std::string sized_name = base_name;
+        sized_name += "_" + std::to_string(node->input_port_sizes[wide_port]);
+        sized_name += "_" + std::to_string(node->input_port_sizes[narrow_port]);
+        sized_name += "_" + std::to_string(node->output_port_sizes[0]);
+        node->name = vtr::strdup(sized_name.c_str());
+    }
+
+    node->traverse_visited = mark;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: add_the_blackbox_for_mults_yosys)
+ *
+ * Adds one blackbox module to `design` for every instance recorded on the
+ * hard multiplier model. The module is called "multiply" when
+ * configuration.fixed_hard_multiplier is non-zero, "mult_<a>_<b>_<out>"
+ * otherwise. Its first size_a input bits are named after the model's second
+ * input port, the next size_b bits after its first input port, and the
+ * size_out output bits after the model's first output port.
+ *
+ * Returns without doing anything when there is no hard multiplier model or
+ * no instance; otherwise free_multipliers() is called once all modules exist.
+ *-------------------------------------------------------------------------*/
+void add_the_blackbox_for_mults_yosys(Yosys::Design *design)
+{
+    /* Check to make sure this target architecture has hard multipliers */
+    if (hard_multipliers == NULL)
+        return;
+
+    /* Get the names of the ports for the multiplier */
+    char *pb = hard_multipliers->inputs->name;
+    char *pa = hard_multipliers->inputs->next->name;
+    char *po = hard_multipliers->outputs->name;
+
+    /* find the multiplier devices in the tech library */
+    t_multiplier *first_instance = (t_multiplier *)(hard_multipliers->instances);
+    if (first_instance == NULL) /* No multipliers instantiated */
+        return;
+
+    /* simplified way of getting the multsize, but fine for quick example */
+    for (t_multiplier *muls = first_instance; muls != NULL; muls = muls->next) {
+        /* write out this multiplier model */
+        const std::string mul_name = (configuration.fixed_hard_multiplier != 0)
+                                       ? std::string("multiply")
+                                       : Yosys::stringf("mult_%d_%d_%d", muls->size_a, muls->size_b, muls->size_out);
+
+        Yosys::RTLIL::Module *module = new Yosys::RTLIL::Module;
+        module->name = Yosys::RTLIL::escape_id(mul_name);
+
+        if (design->module(module->name))
+            Yosys::log_error("Duplicate definition of module %s!\n", Yosys::log_id(module->name));
+        design->add(module);
+
+        /* per wide port: highest bit index seen + 1, and whether the port is an input */
+        Yosys::hashlib::dict<Yosys::RTLIL::IdString, std::pair<int, bool>> wideports_cache;
+
+        /* creates the wire for one port bit, flags its direction and records it in the cache */
+        auto add_port_bit = [&](std::string bit_name, bool is_input) {
+            Yosys::RTLIL::Wire *wire = to_wire(bit_name, module);
+            if (is_input)
+                wire->port_input = true;
+            else
+                wire->port_output = true;
+
+            std::pair<Yosys::RTLIL::IdString, int> wp = wideports_split(bit_name);
+            if (!wp.first.empty() && wp.second >= 0) {
+                std::pair<int, bool> &entry = wideports_cache[wp.first];
+                entry.first = std::max(entry.first, wp.second + 1);
+                entry.second = is_input;
+            }
+        };
+
+        /* add the inputs: size_a bits of port a followed by size_b bits of port b */
+        const int hard_mult_inputs = muls->size_a + muls->size_b;
+        for (int i = 0; i < hard_mult_inputs; i++) {
+            const bool on_port_a = i < muls->size_a;
+            add_port_bit(Yosys::stringf("%s[%d]", on_port_a ? pa : pb, on_port_a ? i : i - muls->size_a), true);
+        }
+
+        /* add the outputs */
+        for (int i = 0; i < muls->size_out; i++) {
+            add_port_bit(Yosys::stringf("%s[%d]", po, i), false);
+        }
+
+        handle_wideports_cache(&wideports_cache, module);
+
+        module->fixup_ports();
+        wideports_cache.clear();
+
+        module->attributes[Yosys::ID::blackbox] = Yosys::RTLIL::Const(1);
+    }
+
+    free_multipliers();
+}
+
+/*-------------------------------------------------------------------------
+ * (function: define_mult_function_yosys)
+ *
+ * Instantiates one multiplier cell in `module` for the netlist node `node`
+ * and binds every input and output bit of the node to a port bit of it.
+ *
+ * Cell type: "multiply" when configuration.fixed_hard_multiplier is set,
+ * otherwise "mult_<x>_<y>_<out>". The operands keep their order only when
+ * the first input port is strictly wider than the second; in every other
+ * case they are swapped ("flipped"), both in the type name and in the
+ * port binding below.
+ *
+ * Port names are taken from the global hard_multipliers model. Port bits
+ * that wideports_split() recognises as part of a wide port are gathered
+ * in a local cache that is finally passed to handle_cell_wideports_cache().
+ *-----------------------------------------------------------------------*/
+void define_mult_function_yosys(nnode_t *node, Yosys::Module *module, Yosys::Design *design)
+{
+    oassert(node->input_port_sizes[0] > 0);
+    oassert(node->input_port_sizes[1] > 0);
+    oassert(node->output_port_sizes[0] > 0);
+
+    const int width_a = node->input_port_sizes[0];
+    const int width_b = node->input_port_sizes[1];
+    const int width_out = node->output_port_sizes[0];
+
+    const bool fixed_size = (configuration.fixed_hard_multiplier != 0);
+    const bool flip = !fixed_size && !(width_a > width_b);
+
+    std::string cell_type_name;
+    if (fixed_size)
+        cell_type_name = "multiply";
+    else if (flip)
+        cell_type_name = Yosys::stringf("mult_%d_%d_%d", width_b, width_a, width_out);
+    else
+        cell_type_name = Yosys::stringf("mult_%d_%d_%d", width_a, width_b, width_out);
+
+    Yosys::IdString celltype = Yosys::RTLIL::escape_id(cell_type_name);
+    Yosys::RTLIL::Cell *cell = module->addCell(NEW_ID, celltype);
+
+    Yosys::hashlib::dict<Yosys::RTLIL::IdString, Yosys::hashlib::dict<int, Yosys::SigBit>> cell_wideports_cache;
+
+    /* Binds the wire called `signal` to the cell port bit called `port_bit`:
+     * directly when the name is not part of a wide port, via the cache otherwise */
+    const auto bind_port_bit = [&](const std::string &port_bit, const std::string &signal) {
+        std::pair<Yosys::RTLIL::IdString, int> split = wideports_split(port_bit);
+        if (!split.first.empty())
+            cell_wideports_cache[split.first][split.second] = to_wire(signal, module);
+        else
+            cell->setPort(Yosys::RTLIL::escape_id(port_bit), to_wire(signal, module));
+    };
+
+    /* Number of leading iterations that are bound to the model's second input port */
+    const int leading_width = flip ? width_b : width_a;
+
+    for (int i = 0; i < node->num_input_pins; i++) {
+        const bool leading = (i < leading_width);
+
+        /* Which pin of the node feeds this port bit, and which bit of the port it is */
+        int input_index;
+        long port_index;
+        if (leading) {
+            input_index = flip ? i + width_a : i;
+            port_index = i;
+        } else {
+            input_index = flip ? i - width_b : i;
+            port_index = flip ? i - width_b : i - width_a;
+        }
+
+        nnet_t *net = node->input_pins[input_index]->net;
+        oassert(net->num_driver_pins == 1);
+        npin_t *driver_pin = net->driver_pins[0];
+
+        const auto *model_port = leading ? hard_multipliers->inputs->next : hard_multipliers->inputs;
+        std::string p = Yosys::stringf("%s[%ld]", model_port->name, port_index);
+
+        /* The driving signal is named after the pin, or after its node when the pin is unnamed */
+        std::string q;
+        if (driver_pin->name)
+            q = driver_pin->name;
+        else
+            q = driver_pin->node->name;
+
+        bind_port_bit(p, q);
+    }
+
+    for (int i = 0; i < node->num_output_pins; i++) {
+        std::string p = Yosys::stringf("%s[%d]", hard_multipliers->outputs->name, i);
+        std::string q = node->output_pins[i]->name;
+
+        bind_port_bit(p, q);
+    }
+
+    handle_cell_wideports_cache(&cell_wideports_cache, design, module, cell);
+}
+
+/*-----------------------------------------------------------------------
+ * (function: init_split_multiplier)
+ *	Set up `ptr` as an a x b multiplier piece carved out of `node`.
+ *	Type, AST link and traversal mark are copied from `node`.
+ *	The first-operand pins come from node_a (copied) when it is given,
+ *	otherwise they are remapped from `node` starting at offset offa.
+ *	The second-operand pins come from node_b's second port (copied)
+ *	when it is given, otherwise they are remapped from `node`'s second
+ *	port starting at offset offb.
+ *	The a + b output pins are left NULL; callers connect them later to
+ *	the cascading multipliers/adders.
+ *---------------------------------------------------------------------*/
+void init_split_multiplier(nnode_t *node, nnode_t *ptr, int offa, int a, int offb, int b, nnode_t *node_a, nnode_t *node_b)
+{
+    const int total_pins = a + b;
+
+    /* Inherit bookkeeping from the multiplier being split */
+    ptr->type = node->type;
+    ptr->related_ast_node = node->related_ast_node;
+    ptr->traverse_visited = node->traverse_visited;
+    ptr->node_data = nullptr;
+
+    /* Two input ports of widths a and b */
+    ptr->num_input_port_sizes = 2;
+    ptr->input_port_sizes = (int *)vtr::malloc(2 * sizeof(int));
+    ptr->input_port_sizes[0] = a;
+    ptr->input_port_sizes[1] = b;
+
+    /* One output port of width a + b */
+    ptr->num_output_port_sizes = 1;
+    ptr->output_port_sizes = (int *)vtr::malloc(sizeof(int));
+    ptr->output_port_sizes[0] = total_pins;
+
+    /* Input pin table */
+    ptr->num_input_pins = total_pins;
+    ptr->input_pins = (npin_t **)vtr::malloc(sizeof(void *) * total_pins);
+
+    /* First operand occupies slots [0, a) */
+    if (node_a) {
+        for (int bit = 0; bit < a; bit++)
+            add_input_pin_to_node(ptr, copy_input_npin(node_a->input_pins[bit]), bit);
+    } else {
+        for (int bit = 0; bit < a; bit++)
+            remap_pin_to_new_node(node->input_pins[bit + offa], ptr, bit);
+    }
+
+    /* Second operand occupies slots [a, a + b) */
+    if (node_b) {
+        for (int bit = 0; bit < b; bit++)
+            add_input_pin_to_node(ptr, copy_input_npin(node_b->input_pins[bit + node_b->input_port_sizes[0]]), bit + a);
+    } else {
+        for (int bit = 0; bit < b; bit++)
+            remap_pin_to_new_node(node->input_pins[bit + node->input_port_sizes[0] + offb], ptr, bit + a);
+    }
+
+    /* Output pin table starts out empty */
+    ptr->num_output_pins = total_pins;
+    ptr->output_pins = (npin_t **)vtr::malloc(sizeof(void *) * total_pins);
+    for (int bit = 0; bit < total_pins; bit++)
+        ptr->output_pins[bit] = nullptr;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: init_multiplier_adder)
+ *
+ * Turns `node` into an ADD node with input ports of widths a and b and
+ * an output port as wide as the larger of the two. AST link and
+ * traversal mark are taken from `parent`. All pin slots are allocated
+ * but left NULL, and the node is appended to the global add_list.
+ *-----------------------------------------------------------------------*/
+void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b)
+{
+    const int in_pins = a + b;
+    /* The result is as wide as the wider operand */
+    const int out_pins = std::max(a, b);
+
+    node->type = ADD;
+    node->related_ast_node = parent->related_ast_node;
+    node->traverse_visited = parent->traverse_visited;
+    node->node_data = nullptr;
+
+    /* Port descriptors */
+    node->num_input_port_sizes = 2;
+    node->input_port_sizes = (int *)vtr::malloc(2 * sizeof(int));
+    node->input_port_sizes[0] = a;
+    node->input_port_sizes[1] = b;
+    node->num_output_port_sizes = 1;
+    node->output_port_sizes = (int *)vtr::malloc(sizeof(int));
+    node->output_port_sizes[0] = out_pins;
+
+    /* Empty input pin table */
+    node->num_input_pins = in_pins;
+    node->input_pins = (npin_t **)vtr::malloc(sizeof(void *) * in_pins);
+    std::fill_n(node->input_pins, in_pins, static_cast<npin_t *>(nullptr));
+
+    /* Empty output pin table */
+    node->num_output_pins = out_pins;
+    node->output_pins = (npin_t **)vtr::malloc(sizeof(void *) * out_pins);
+    std::fill_n(node->output_pins, out_pins, static_cast<npin_t *>(nullptr));
+
+    add_list = insert_in_vptr_list(add_list, node);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_multiplier)
+ *
+ * Splits a multiplier on BOTH inputs into four smaller multipliers so
+ *  that it better "fits" the resources of the targeted FPGA
+ *  architecture.
+ *
+ * This is the lowest level: the caller decides how to split and this
+ *  function only carries it out. The end result is:
+ *
+ *  a1a0 * b1b0 => a0 * b0 + a0 * b1 + a1 * b0 + a1 * b1 => c1c0 => c
+ *
+ * By "balancing" the additions one of them can be dropped: a0 * b0 and
+ * a1 * b1 do not overlap in bits, so they are concatenated instead of
+ * added. The resulting logic is:
+ *
+ * ((a1 * b1) . (a0 * b0)) + ((a0 * b1) + (a1 * b0)) ==> Result
+ *
+ * Some of the additions need sign extension, which is not a problem
+ * because the extension always widens and never narrows.
+ *
+ * The four new multipliers are queued on mult_list and `node` is freed.
+ *-----------------------------------------------------------------------*/
+void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist)
+{
+    /* Check for a legitimate split */
+    oassert(node->input_port_sizes[0] == (a0 + a1));
+    oassert(node->input_port_sizes[1] == (b0 + b1));
+
+    /* Allocates a node at node's location named "<node name><suffix>" */
+    const auto new_named_node = [node](const char *suffix) -> nnode_t * {
+        nnode_t *fresh = allocate_nnode(node->loc);
+        fresh->name = (char *)vtr::malloc(strlen(node->name) + strlen(suffix) + 1);
+        strcpy(fresh->name, node->name);
+        strcat(fresh->name, suffix);
+        return fresh;
+    };
+
+    /* Small multiply (a0 * b0): takes over the original low-order pins */
+    nnode_t *a0b0 = new_named_node("-0");
+    init_split_multiplier(node, a0b0, 0, a0, 0, b0, nullptr, nullptr);
+    mult_list = insert_in_vptr_list(mult_list, a0b0);
+
+    /* Big multiply (a1 * b1): takes over the original high-order pins */
+    nnode_t *a1b1 = new_named_node("-3");
+    init_split_multiplier(node, a1b1, a0, a1, b0, b1, nullptr, nullptr);
+    mult_list = insert_in_vptr_list(mult_list, a1b1);
+
+    /* 2nd multiply (a0 * b1): copies its pins from the two nodes above */
+    nnode_t *a0b1 = new_named_node("-1");
+    init_split_multiplier(node, a0b1, 0, a0, b0, b1, a0b0, a1b1);
+    mult_list = insert_in_vptr_list(mult_list, a0b1);
+
+    /* 3rd multiply (a1 * b0): copies its pins from the two nodes above */
+    nnode_t *a1b0 = new_named_node("-2");
+    init_split_multiplier(node, a1b0, a0, a1, 0, b0, a1b1, a0b0);
+    mult_list = insert_in_vptr_list(mult_list, a1b0);
+
+    /* Initial add of the two cross terms. Each operand gets one extra bit,
+     * tied to gnd below, because this addition can carry out in the worst case */
+    nnode_t *addsmall = new_named_node("-add0");
+    init_multiplier_adder(addsmall, a1b0, a1b0->num_output_pins + 1, a0b1->num_output_pins + 1);
+
+    /* The BIG add: cross-term sum plus the concatenation of a1b1 and the upper part of a0b0 */
+    nnode_t *addbig = new_named_node("-add1");
+    init_multiplier_adder(addbig, addsmall, addsmall->num_output_pins, a0b0->num_output_pins - b0 + a1b1->num_output_pins);
+
+    /* Port a of addsmall: a1b0 outputs, then the grounded extra bit */
+    for (int bit = 0; bit < a1b0->num_output_pins; bit++)
+        connect_nodes(a1b0, bit, addsmall, bit);
+    add_input_pin_to_node(addsmall, get_zero_pin(netlist), a1b0->num_output_pins);
+
+    /* Port b of addsmall: a0b1 outputs, then the grounded extra bit */
+    for (int bit = 0; bit < a0b1->num_output_pins; bit++)
+        connect_nodes(a0b1, bit, addsmall, bit + addsmall->input_port_sizes[0]);
+    add_input_pin_to_node(addsmall, get_zero_pin(netlist), a0b1->num_output_pins + addsmall->input_port_sizes[0]);
+
+    /* Port a of addbig: every output of addsmall */
+    const int port_b_base = addsmall->num_output_pins;
+    for (int bit = 0; bit < port_b_base; bit++)
+        connect_nodes(addsmall, bit, addbig, bit);
+
+    /* Port b of addbig, lower part: a0b0 outputs from bit b0 upwards */
+    for (int bit = b0; bit < a0b0->output_port_sizes[0]; bit++)
+        connect_nodes(a0b0, bit, addbig, bit - b0 + port_b_base);
+
+    /* Port b of addbig, upper part: all a1b1 outputs, right after the a0b0 bits */
+    const int a1b1_base = port_b_base + a0b0->output_port_sizes[0] - b0;
+    for (int bit = 0; bit < a1b1->output_port_sizes[0]; bit++)
+        connect_nodes(a1b1, bit, addbig, bit + a1b1_base);
+
+    /* The lowest b0 result bits come directly from a0b0 */
+    for (int bit = 0; bit < b0; bit++)
+        remap_pin_to_new_node(node->output_pins[bit], a0b0, bit);
+
+    /* The remaining result bits come from addbig */
+    for (int bit = 0; bit < addbig->num_output_pins; bit++)
+        remap_pin_to_new_node(node->output_pins[bit + b0], addbig, bit);
+
+    // CLEAN UP
+    free_nnode(node);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_multiplier_a)
+ *
+ * Splits only the "a" input of a multiplier into two smaller
+ *  multipliers so that it better "fits" the resources of the targeted
+ *  FPGA architecture.
+ *
+ * This is the lowest level: the caller decides how to split and this
+ *  function only carries it out. The end result is:
+ *
+ *  a1a0 * b => a0 * b + a1 * b => c
+ *
+ * The addition needs sign extension, which is not a problem because
+ * the extension always widens and never narrows.
+ *
+ * Both new multipliers are queued on mult_list and `node` is freed.
+ *-----------------------------------------------------------------------*/
+void split_multiplier_a(nnode_t *node, int a0, int a1, int b)
+{
+    /* Check for a legitimate split */
+    oassert(node->input_port_sizes[0] == (a0 + a1));
+    oassert(node->input_port_sizes[1] == b);
+
+    /* Allocates a node at node's location named "<node name><suffix>" */
+    const auto new_named_node = [node](const char *suffix) -> nnode_t * {
+        nnode_t *fresh = allocate_nnode(node->loc);
+        fresh->name = (char *)vtr::malloc(strlen(node->name) + strlen(suffix) + 1);
+        strcpy(fresh->name, node->name);
+        strcat(fresh->name, suffix);
+        return fresh;
+    };
+
+    /* a0 * b: takes over the low a0 pins and all of b */
+    nnode_t *a0b = new_named_node("-0");
+    init_split_multiplier(node, a0b, 0, a0, 0, b, nullptr, nullptr);
+    mult_list = insert_in_vptr_list(mult_list, a0b);
+
+    /* a1 * b: takes over the high a1 pins and copies b from a0b */
+    nnode_t *a1b = new_named_node("-1");
+    init_split_multiplier(node, a1b, a0, a1, 0, b, nullptr, a0b);
+    mult_list = insert_in_vptr_list(mult_list, a1b);
+
+    /* Adder that combines the two partial results */
+    nnode_t *addsmall = new_named_node("-add0");
+    init_multiplier_adder(addsmall, a0b, b, a1b->num_output_pins);
+
+    /* Port a of the adder: a0b outputs from bit a0 upwards */
+    for (int bit = a0; bit < a0b->num_output_pins; bit++)
+        connect_nodes(a0b, bit, addsmall, bit - a0);
+
+    /* Port b of the adder: every a1b output */
+    for (int bit = 0; bit < a1b->num_output_pins; bit++)
+        connect_nodes(a1b, bit, addsmall, bit + addsmall->input_port_sizes[0]);
+
+    /* The lowest a0 result bits come directly from a0b */
+    for (int bit = 0; bit < a0; bit++)
+        remap_pin_to_new_node(node->output_pins[bit], a0b, bit);
+
+    /* The remaining result bits come from the adder */
+    for (int bit = 0; bit < addsmall->num_output_pins; bit++)
+        remap_pin_to_new_node(node->output_pins[bit + a0], addsmall, bit);
+
+    // CLEAN UP
+    free_nnode(node);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_multiplier_b)
+ *
+ * Splits only the "b" input of a multiplier into two smaller
+ *  multipliers so that it better "fits" the resources of the targeted
+ *  FPGA architecture.
+ *
+ * This is the lowest level: the caller decides how to split and this
+ *  function only carries it out. The end result is:
+ *
+ *  a * b1b0 => a * b1 + a * b0 => c
+ *
+ * The addition needs sign extension, which is not a problem because
+ * the extension always widens and never narrows.
+ *
+ * Both new multipliers are queued on mult_list and `node` is freed.
+ *-----------------------------------------------------------------------*/
+void split_multiplier_b(nnode_t *node, int a, int b1, int b0)
+{
+    /* Check for a legitimate split */
+    oassert(node->input_port_sizes[0] == a);
+    oassert(node->input_port_sizes[1] == (b0 + b1));
+
+    /* Allocates a node at node's location named "<node name><suffix>" */
+    const auto new_named_node = [node](const char *suffix) -> nnode_t * {
+        nnode_t *fresh = allocate_nnode(node->loc);
+        fresh->name = (char *)vtr::malloc(strlen(node->name) + strlen(suffix) + 1);
+        strcpy(fresh->name, node->name);
+        strcat(fresh->name, suffix);
+        return fresh;
+    };
+
+    /* a * b0: takes over all of a and the low b0 pins */
+    nnode_t *ab0 = new_named_node("-0");
+    init_split_multiplier(node, ab0, 0, a, 0, b0, nullptr, nullptr);
+    mult_list = insert_in_vptr_list(mult_list, ab0);
+
+    /* a * b1: copies a from ab0 and takes over the b pins from offset b0 */
+    nnode_t *ab1 = new_named_node("-1");
+    init_split_multiplier(node, ab1, 0, a, b0, b1, ab0, nullptr);
+    mult_list = insert_in_vptr_list(mult_list, ab1);
+
+    /* Adder that combines the two partial results */
+    nnode_t *addsmall = new_named_node("-add0");
+    init_multiplier_adder(addsmall, ab1, ab1->num_output_pins, a + b1);
+
+    const int port_width = a + b1;
+
+    /* First adder port, low bits: ab0 outputs from bit b0 upwards */
+    for (int bit = b0; bit < ab0->output_port_sizes[0]; bit++)
+        connect_nodes(ab0, bit, addsmall, bit - b0);
+
+    /* First adder port, high bits: sign extend with the top ab0 output */
+    for (int slot = ab0->output_port_sizes[0] - b0; slot < port_width; slot++)
+        connect_nodes(ab0, ab0->output_port_sizes[0] - 1, addsmall, slot);
+
+    /* Second adder port: ab1 outputs fill slots [a + b1, 2 * (a + b1)) */
+    for (int bit = 0; bit < port_width; bit++)
+        connect_nodes(ab1, bit, addsmall, bit + port_width);
+
+    /* The lowest b0 result bits come directly from ab0 */
+    for (int bit = 0; bit < b0; bit++)
+        remap_pin_to_new_node(node->output_pins[bit], ab0, bit);
+
+    /* The remaining result bits come from the adder */
+    for (int bit = b0; bit < node->num_output_pins; bit++)
+        remap_pin_to_new_node(node->output_pins[bit], addsmall, bit - b0);
+
+    // CLEAN UP
+    free_nnode(node);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: pad_multiplier)
+ *
+ * Fill out a multiplier to a fixed size. Size is retrieved from global
+ *	hard_multipliers data.
+ *
+ * node:    MULTIPLY node that is widened in place (type is asserted).
+ * netlist: passed to get_zero_pin()/get_pad_pin() to obtain the pin that
+ *          drives the added input bits.
+ *
+ * The node is also reported through record_mult_distribution().
+ *
+ * NOTE: The inputs are extended based on multiplier padding setting:
+ *	configuration.mult_padding == 0 uses get_zero_pin(), any other
+ *	value uses get_pad_pin().
+ *-----------------------------------------------------------------------*/
+void pad_multiplier(nnode_t *node, netlist_t *netlist)
+{
+    int diffa, diffb, diffout;
+    int sizea, sizeb, sizeout;
+    int ina, inb;
+
+    int testa, testb;
+
+    /* Counter kept across calls; it numbers the padded output pins */
+    static int pad_pin_number = 0;
+
+    oassert(node->type == MULTIPLY);
+    oassert(hard_multipliers != NULL);
+
+    /* Current widths of the node's ports */
+    sizea = node->input_port_sizes[0];
+    sizeb = node->input_port_sizes[1];
+    sizeout = node->output_port_sizes[0];
+    record_mult_distribution(node);
+
+    /* Calculate the BEST fit hard multiplier to use */
+    /* Start from the model's two input sizes, ordered so that ina >= inb */
+    ina = hard_multipliers->inputs->size;
+    inb = hard_multipliers->inputs->next->size;
+    if (ina < inb) {
+        ina = hard_multipliers->inputs->next->size;
+        inb = hard_multipliers->inputs->size;
+    }
+    /* Number of bits each port must grow by */
+    diffa = ina - sizea;
+    diffb = inb - sizeb;
+    diffout = hard_multipliers->outputs->size - sizeout;
+
+    if (configuration.split_hard_multiplier == 1) {
+        /* Walk the pb_types list for a block that fits both operands with
+         * less total input padding; stop as soon as no padding is needed */
+        t_linked_vptr *plist = hard_multipliers->pb_types;
+        while ((diffa + diffb) && plist) {
+            t_pb_type *physical = (t_pb_type *)(plist->data_vptr);
+            plist = plist->next;
+            /* NOTE(review): assumes ports[0]/ports[1] are the two inputs and
+             * ports[2] is the output of the physical block - confirm */
+            testa = physical->ports[0].num_pins;
+            testb = physical->ports[1].num_pins;
+            if ((testa >= sizea) && (testb >= sizeb) && ((testa - sizea + testb - sizeb) < (diffa + diffb))) {
+                diffa = testa - sizea;
+                diffb = testb - sizeb;
+                diffout = physical->ports[2].num_pins - sizeout;
+            }
+        }
+    }
+
+    /* Expand the inputs */
+    if ((diffa != 0) || (diffb != 0)) {
+        allocate_more_input_pins(node, diffa + diffb);
+
+        /* Shift pins for expansion of first input pins */
+        if (diffa != 0) {
+            /* Move the sizeb second-operand pins, highest index first, from
+             * their old slots to slots counted back from num_input_pins - diffb
+             * (presumably num_input_pins already includes the new pins - confirm) */
+            for (int i = 1; i <= sizeb; i++) {
+                move_input_pin(node, sizea + sizeb - i, node->num_input_pins - diffb - i);
+            }
+
+            /* Connect unused first input pins to zero/pad pin */
+            for (int i = 0; i < diffa; i++) {
+                if (configuration.mult_padding == 0)
+                    add_input_pin_to_node(node, get_zero_pin(netlist), i + sizea);
+                else
+                    add_input_pin_to_node(node, get_pad_pin(netlist), i + sizea);
+            }
+
+            node->input_port_sizes[0] = sizea + diffa;
+        }
+
+        if (diffb != 0) {
+            /* Connect unused second input pins to zero/pad pin */
+            /* These are the last diffb slots of the input pin array */
+            for (int i = 1; i <= diffb; i++) {
+                if (configuration.mult_padding == 0)
+                    add_input_pin_to_node(node, get_zero_pin(netlist), node->num_input_pins - i);
+                else
+                    add_input_pin_to_node(node, get_pad_pin(netlist), node->num_input_pins - i);
+            }
+
+            node->input_port_sizes[1] = sizeb + diffb;
+        }
+    }
+
+    /* Expand the outputs */
+    if (diffout != 0) {
+        allocate_more_output_pins(node, diffout);
+        for (int i = 0; i < diffout; i++) {
+            // Add new pins to the higher order spots.
+            npin_t *new_pin = allocate_npin();
+            // Pad outputs with a unique and descriptive name to avoid collisions.
+            new_pin->name = append_string("", "unconnected_multiplier_output~%d", pad_pin_number++);
+            add_output_pin_to_node(node, new_pin, i + sizeout);
+        }
+        node->output_port_sizes[0] = sizeout + diffout;
+    }
+
+    return;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: iterate_multipliers)
+ *
+ * This function will iterate over all of the multiply operations that
+ *	exist in the netlist and perform a splitting so that they can
+ *	fit into a basic hard multiplier block that exists on the FPGA.
+ *	If the proper option is set, then it will be expanded as well
+ *	to just use a fixed size hard multiplier.
+ *
+ * netlist: netlist handed on to the split/pad helpers.
+ *
+ * Nodes are taken from the global mult_list until it is empty; the split
+ * helpers push the smaller multipliers they create back onto that list,
+ * so those are processed by later iterations of the same loop.
+ * Does nothing when no hard multiplier model is available.
+ *-----------------------------------------------------------------------*/
+void iterate_multipliers(netlist_t *netlist)
+{
+    /* Can only perform the optimisation if hard multipliers exist! */
+    if (hard_multipliers == NULL)
+        return;
+
+    /* Widths of the two hard multiplier input ports, ordered once. They stay
+     * constant for the whole run; the orientation used for a particular node
+     * is derived from them inside the loop, so a swap done for one node can
+     * no longer leak into the nodes processed after it. */
+    const int hard_in0 = hard_multipliers->inputs->size;
+    const int hard_in1 = hard_multipliers->inputs->next->size;
+    const int hard_wide = std::max<int>(hard_in0, hard_in1);
+    const int hard_narrow = std::min<int>(hard_in0, hard_in1);
+
+    while (mult_list != NULL) {
+        nnode_t *node = (nnode_t *)mult_list->data_vptr;
+        mult_list = delete_in_vptr_list(mult_list);
+
+        oassert(node != NULL);
+
+        if (node->type == HARD_IP)
+            node->type = MULTIPLY;
+
+        oassert(node->type == MULTIPLY);
+
+        const int mula = node->input_port_sizes[0];
+        const int mulb = node->input_port_sizes[1];
+        const int mult_size = std::max<int>(mula, mulb);
+
+        /* Pair the wider hard input with the wider operand of THIS node */
+        const bool b_is_wider = (mula < mulb);
+        const int sizea = b_is_wider ? hard_narrow : hard_wide;
+        const int sizeb = b_is_wider ? hard_wide : hard_narrow;
+
+        /* Do I need to split the multiplier on both inputs? */
+        if ((mula > sizea) && (mulb > sizeb)) {
+            const int a0 = sizea;
+            const int a1 = mula - sizea;
+            const int b0 = sizeb;
+            const int b1 = mulb - sizeb;
+            split_multiplier(node, a0, b0, a1, b1, netlist);
+        } else if (mula > sizea) /* split multiplier on a input? */
+        {
+            const int a0 = sizea;
+            const int a1 = mula - sizea;
+            split_multiplier_a(node, a0, a1, mulb);
+        } else if (mulb > sizeb) /* split multiplier on b input? */
+        {
+            /* the high part (b1) is one hard multiplier wide, the low part (b0) is the rest */
+            const int b1 = sizeb;
+            const int b0 = mulb - sizeb;
+            split_multiplier_b(node, mula, b1, b0);
+        }
+        // if either of the multiplicands is larger than the
+        // minimum hard multiplier size, use hard multiplier
+        // TODO: implement multipliers where one of the operands is
+        // 1 bit wide using soft logic
+        else if (mult_size >= min_mult || mula == 1 || mulb == 1) {
+            /* Check to ensure IF mult needs to be exact size */
+            if (configuration.fixed_hard_multiplier != 0)
+                pad_multiplier(node, netlist);
+
+            /* Otherwise, we still want to record the multiplier node for
+             * reporting later on (the pad_multiplier function does this for the
+             * other case) */
+            else {
+                record_mult_distribution(node);
+            }
+        } else if (hard_adders) {
+            /* Small multiplier: build it from soft logic and adders, but only
+             * in fixed hard multiplier mode */
+            if (configuration.fixed_hard_multiplier != 0) {
+                split_soft_multiplier(node, netlist);
+            }
+        }
+    }
+    return;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: split_soft_multiplier)
+ *
+ * This function splits the input multiplier (node) into partial products (AND gates) and
+ * adders, as shown below. The partial products starts with "I", and all the partial products
+ * generated are added together by implementing a balanced adder tree to produce the final product
+ * Sample 4x4 multiplier to help understand logic:
+ *
+ * 	    				A3 	A2	A1	A0
+ *	    				B3 	B2 	B1 	B0
+ *	   	-------------------------------
+ *	    				I03	I02 I01 I00
+ *	   	+         	I13	I12	I11	I10
+ *	    		I23	I22	I21	I20             Level 0
+  	+	I33	I32	I31 I30
+ *      -------------------------------
+ *  		    C4	C3	C2	C1  C0
+ * 	+	D4  D3	D2	D1  D0	I20             Level 1
+ *  	-------------------------------
+ *  	E5	E4  E3  E2	E1	E0	C0  I00     Level 2
+ *
+ *-------------------------------------------------------------------------*/
+void split_soft_multiplier(nnode_t *node, netlist_t *netlist)
+{
+    oassert(node->num_output_pins > 0);
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->num_output_port_sizes == 1);
+
+    size_t multiplier_width = static_cast<size_t>(node->input_port_sizes[0]);
+    size_t multiplicand_width = static_cast<size_t>(node->input_port_sizes[1]);
+
+    // ODIN II doesn't work with multiplicand sizes of 1 since it assumes that the
+    // output of the multiplier is still the sum of the operands sizes. However, it
+    // should only be equal to the long operand since its an AND operation in this case.
+    // If this is fixed, this assert statement should be removed and the code will work properly
+    oassert(multiplicand_width > 1);
+
+    // number of adders in a balanced tree of the partial product rows
+    const int add_levels = std::ceil(std::log((double)multiplicand_width) / std::log(2.));
+
+    // data structure holding the rows of output pins to be added in each addition stage
+    // as well as the shift of each row from the position of the first output
+    std::vector<std::vector<AdderTreeRow>> addition_stages(add_levels + 1);
+    // 2-D array of adders, indexed by the level of the adder in the tree and the adder id within the level
+    std::vector<std::vector<nnode_t *>> adders(add_levels);
+    // array holding the adder width at each level in the adder tree
+    std::vector<std::vector<size_t>> adder_widths(add_levels);
+
+    // 2-D array of partial products. [0..multiplicand_width][0..multiplier_width]
+    std::vector<std::vector<nnode_t *>> partial_products(multiplicand_width);
+
+    addition_stages[0].resize(multiplicand_width);
+    // initialize all the AND gates needed for the partial products
+    for (size_t i = 0; i < multiplicand_width; i++) {
+        std::vector<std::pair<nnode_t *, int>> pp_bits(multiplier_width);
+        // resize the ith row of the partial products
+        partial_products[i].resize(multiplier_width);
+        for (size_t j = 0; j < multiplier_width; j++) {
+            // create each one of the partial products
+            partial_products[i][j] = make_1port_logic_gate(LOGICAL_AND, 2, node, node->traverse_visited);
+            pp_bits[j] = {partial_products[i][j], 0};
+        }
+        // add the partial product rows the addition stages data structure
+        addition_stages[0][i] = {i, pp_bits};
+    }
+
+    // generate the connections to the AND gates that generates the partial products of the multiplication
+    for (size_t i = 0; i < multiplicand_width; i++) {
+        for (size_t j = 0; j < multiplier_width; j++) {
+            // hookup the multiplier bits to the AND gates
+            if (i == 0) {
+                // when connecting the input to an AND gate for the first time, remap the input
+                remap_pin_to_new_node(node->input_pins[j], partial_products[i][j], 1);
+            } else {
+                // this input was remapped before, copy from the AND gate input instead
+                add_input_pin_to_node(partial_products[i][j], copy_input_npin(partial_products[0][j]->input_pins[1]), 1);
+            }
+            // hookup the input multiplicand bits the AND gates
+            if (j == 0) {
+                // when connecting the input to an AND gate for the first time, remap the input
+                remap_pin_to_new_node(node->input_pins[i + node->input_port_sizes[0]], partial_products[i][j], 0);
+            } else {
+                // this input was remapped before, copy from the AND gate input instead
+                add_input_pin_to_node(partial_products[i][j], copy_input_npin(partial_products[i][0]->input_pins[0]), 0);
+            }
+        }
+    }
+
+    // iterate over all the levels of addition
+    for (size_t level = 0; level < adders.size(); level++) {
+        // the number of rows in the next stage is the ceiling of number of rows in this stage divided by 2
+        addition_stages[level + 1].resize(std::ceil(addition_stages[level].size() / 2.));
+        // the number of adders in this stage is the integer division of the number of rows in this stage
+        adder_widths[level].resize(addition_stages[level].size() / 2);
+        adders[level].resize(addition_stages[level].size() / 2);
+
+        // iterate over every two rows
+        for (size_t row = 0; row < addition_stages[level].size() - 1; row += 2) {
+            auto &first_row = addition_stages[level][row];
+            auto &second_row = addition_stages[level][row + 1];
+            long shift_difference = second_row.shift - first_row.shift;
+            auto add_id = row / 2;
+
+            // get the widths of the adder, by finding the larger operand size
+            adder_widths[level][add_id] = std::max<size_t>(first_row.bits.size() - shift_difference, second_row.bits.size());
+            // first level of addition has a carry out that needs to be generated, so increase adder size by 1
+            if (level == 0)
+                adder_widths[level][add_id]++;
+            // add one bit for carry out if that last bit of the addition is fed by both levels
+            // (was found to be the only case were a carry out will be needed in this multiplier adder tree)
+            if (first_row.bits.size() - shift_difference == second_row.bits.size())
+                adder_widths[level][add_id]++;
+
+            // initialize this adder
+            adders[level][add_id] = allocate_nnode(node->loc);
+            init_multiplier_adder(adders[level][add_id], node, adder_widths[level][add_id], adder_widths[level][add_id]);
+            adders[level][add_id]->name = node_name(adders[level][add_id], node->name);
+
+            // initialize the output of this adder in the next stage
+            addition_stages[level + 1][add_id].shift = first_row.shift;
+            addition_stages[level + 1][add_id].bits.resize(shift_difference + adder_widths[level][add_id]);
+            // copy the bits that weren't fed to adders in the previous stage
+            for (size_t i = 0; (long)i < shift_difference; i++) {
+                addition_stages[level + 1][add_id].bits[i] = first_row.bits[i];
+            }
+            // copy adder output bits to their row in next stage
+            for (size_t i = 0; i < adder_widths[level][add_id]; i++) {
+                addition_stages[level + 1][add_id].bits[i + shift_difference] = {adders[level][add_id], i};
+            }
+
+            // connect the bits in the rows to the adder inputs.
+            for (size_t bit = 0; bit < adder_widths[level][add_id]; bit++) {
+                // input port a of the adder
+                if (bit < first_row.bits.size() - shift_difference) {
+                    auto bit_a = first_row.bits[bit + shift_difference];
+                    connect_nodes(bit_a.first, bit_a.second, adders[level][add_id], bit);
+                } else {
+                    // connect additional inputs to gnd
+                    add_input_pin_to_node(adders[level][add_id], get_zero_pin(netlist), bit);
+                }
+                // input port b of the adder
+                if (bit < second_row.bits.size()) {
+                    connect_nodes(second_row.bits[bit].first, second_row.bits[bit].second, adders[level][add_id], bit + adder_widths[level][add_id]);
+                } else {
+                    // connect additional inputs to gnd
+                    add_input_pin_to_node(adders[level][add_id], get_zero_pin(netlist), bit + adder_widths[level][add_id]);
+                }
+            }
+        }
+
+        // if this level have odd number of rows copy the last row to the next level to be added later
+        if (addition_stages[level].size() % 2 == 1) {
+            addition_stages[level + 1].back() = addition_stages[level].back();
+        }
+    }
+
+    // the size of the last stage of the adder tree should match the output size of the multiplier
+    oassert((long)addition_stages[add_levels][0].bits.size() == node->num_output_pins);
+
+    // Remap the outputs of the multiplier
+    for (size_t i = 0; i < addition_stages[add_levels][0].bits.size(); i++) {
+        auto output_bit = addition_stages[add_levels][0].bits[i];
+        remap_pin_to_new_node(node->output_pins[i], output_bit.first, output_bit.second);
+    }
+
+    // check that all connections and input/output remapping is done right
+    // meaning all the inputs and outputs of the multiplier that is splitted are nullptrs
+    // and all inputs and outputs of the AND gates and adders are not nullptrs
+
+    // check that all the inputs/outputs of the multiplier are remapped
+    for (long i = 0; i < node->num_input_pins; i++) {
+        oassert(!node->input_pins[i]);
+    }
+    for (long i = 0; i < node->num_output_pins; i++) {
+        oassert(!node->output_pins[i]);
+    }
+
+    // check that all the partial product gates have nets connected to their inputs/outputs
+    for (size_t ilevel = 0; ilevel < partial_products.size(); ilevel++) {
+        for (size_t depth = 0; depth < partial_products[ilevel].size(); depth++) {
+            for (int i = 0; i < partial_products[ilevel][depth]->num_input_pins; i++) {
+                oassert(partial_products[ilevel][depth]->input_pins[i]);
+            }
+            for (int i = 0; i < partial_products[ilevel][depth]->num_output_pins; i++) {
+                oassert(partial_products[ilevel][depth]->output_pins[i]);
+            }
+        }
+    }
+
+    // check that all adders have nets connected to their inputs/outputs
+    for (size_t ilevel = 0; ilevel < adders.size(); ilevel++) {
+        for (size_t iadd = 0; iadd < adders[ilevel].size(); iadd++) {
+            for (int i = 0; i < adders[ilevel][iadd]->num_input_pins; i++) {
+                oassert(adders[ilevel][iadd]->input_pins[i]);
+            }
+            for (int i = 0; i < adders[ilevel][iadd]->num_output_pins; i++) {
+                oassert(adders[ilevel][iadd]->output_pins[i]);
+            }
+        }
+    }
+
+    // CLEAN UP
+    cleanup_mult_old_node(node, netlist);
+}
+
+/**
+ * --------------------------------------------------------------------------
+ * (function: is_constant_multipication)
+ *
+ * @brief classify a multipication node according to which of its two
+ * input ports are driven only by the constant nets of the netlist
+ *
+ * @param node pointer to the multipication netlist node
+ * @param netlist netlist providing the zero and one nets to compare against
+ *
+ * @return multipication ports status
+ * -------------------------------------------------------------------------*/
+mult_port_stat_e is_constant_multipication(nnode_t *node, netlist_t *netlist)
+{
+    /**
+     * Input pin layout of the node
+     * IN1: input_pins[0 .. IN1_width)                     input_port[0]
+     * IN2: input_pins[IN1_width .. IN1_width + IN2_width) input_port[1]
+     */
+    const int IN1_width = node->input_port_sizes[0];
+    const int IN2_width = node->input_port_sizes[1];
+
+    /* a pin counts as constant when its net is named like the zero net or the one net */
+    auto driven_by_constant = [netlist](const npin_t *pin) {
+        const char *net_name = pin->net->name;
+        return strcmp(net_name, netlist->zero_net->name) == 0 || strcmp(net_name, netlist->one_net->name) == 0;
+    };
+
+    /* a port is constant when none of its pins is variable; the scan stops at the first variable pin */
+    auto port_is_constant = [&](int first_pin, int width) {
+        for (int bit = 0; bit < width; bit++) {
+            if (!driven_by_constant(node->input_pins[first_pin + bit]))
+                return false;
+        }
+        return true;
+    };
+
+    /* first port (multiplier), then second port (multiplicand) */
+    const bool multiplier_const = port_is_constant(0, IN1_width);
+    const bool multiplicand_const = port_is_constant(IN1_width, IN2_width);
+
+    if (multiplier_const)
+        return multiplicand_const ? mult_port_stat_e::CONSTANT : mult_port_stat_e::MULTIPLIER_CONSTANT;
+
+    return multiplicand_const ? mult_port_stat_e::MULTIPICAND_CONSTANT : mult_port_stat_e::NOT_CONSTANT;
+}
+
+/**
+ *-------------------------------------------------------------------------------------------
+ * (function: check_constant_multipication )
+ *
+ * @brief look for a multipication with at least one constant port. Such a
+ * node is first passed through the constant port optimization, then
+ * implemented as a constant multipication (cascading adders) and finally
+ * has its outputs connected
+ *
+ * @param node pointing to the mul node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ *
+ * @return true if the node had a constant port and was handled here
+ *-----------------------------------------------------------------------------------------*/
+bool check_constant_multipication(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* nothing to do for a multipication without any constant port */
+    const mult_port_stat_e port_status = is_constant_multipication(node, netlist);
+    if (port_status == mult_port_stat_e::NOT_CONSTANT) {
+        return false;
+    }
+
+    /* the optimization hands back the node that replaces the given one */
+    nnode_t *optimized = perform_const_mult_optimization(port_status, node, traverse_mark_number, netlist);
+
+    /* the implementation step takes the traversal mark as a short */
+    const short mark = static_cast<short>(traverse_mark_number);
+    signal_list_t *product_signals = implement_constant_multipication(optimized, port_status, mark, netlist);
+
+    /* hook the computed signals up to the output pins */
+    connect_constant_mult_outputs(optimized, product_signals);
+
+    return true;
+}
+
+/**
+ *-------------------------------------------------------------------------------------------
+ * (function: perform_const_mult_optimization )
+ *
+ * @brief trim the constant port of a constant multipication.
+ * Walking from the last pin of the constant port towards the first, every
+ * pin is deleted until a pin on the netlist's one net is met. A new mult
+ * node is then created from the shortened constant port and the untouched
+ * variable port, all pins are remapped onto it and the old node is freed
+ *
+ * @param mult_port_stat tells which input port of the node is the constant one
+ * @param node pointing to the mul node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ *
+ * @return the newly created mult node replacing the given (now freed) node
+ *-----------------------------------------------------------------------------------------*/
+static nnode_t *perform_const_mult_optimization(mult_port_stat_e mult_port_stat, nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    /* constant and variable port of the given multipication */
+    signal_list_t *const_port = init_signal_list();
+    signal_list_t *var_port = init_signal_list();
+    operation_list const_signedness = UNSIGNED;
+    operation_list var_signedness = UNSIGNED;
+
+    /* initialize const and var port signals */
+    /* NOTE(review): for any other status (e.g. CONSTANT, which the caller also passes in)
+     * neither branch runs and both lists stay empty - confirm this is intended */
+    if (mult_port_stat == mult_port_stat_e::MULTIPICAND_CONSTANT) {
+        /* adding var port pins (first input port) to signal list */
+        for (int i = 0; i < node->input_port_sizes[0]; i++) {
+            add_pin_to_signal_list(var_port, node->input_pins[i]);
+        }
+        var_signedness = node->attributes->port_a_signed;
+        /* adding const port pins (remaining input pins) to signal list */
+        for (int i = node->input_port_sizes[0]; i < node->num_input_pins; i++) {
+            add_pin_to_signal_list(const_port, node->input_pins[i]);
+        }
+        const_signedness = node->attributes->port_b_signed;
+    } else if (mult_port_stat == mult_port_stat_e::MULTIPLIER_CONSTANT) {
+        /* adding const port pins (first input port) to signal list */
+        for (int i = 0; i < node->input_port_sizes[0]; i++) {
+            add_pin_to_signal_list(const_port, node->input_pins[i]);
+        }
+        const_signedness = node->attributes->port_a_signed;
+        /* adding var port pins (remaining input pins) to signal list */
+        for (int i = node->input_port_sizes[0]; i < node->num_input_pins; i++) {
+            add_pin_to_signal_list(var_port, node->input_pins[i]);
+        }
+        var_signedness = node->attributes->port_b_signed;
+    }
+
+    /* number of const port pins to keep; stays -1 when no pin on the one net is found,
+     * in which case every const pin is deleted and the new const port is empty */
+    int idx = -1;
+    signal_list_t *new_const_port = init_signal_list();
+    /* iterating over const port to determine useless pins */
+    for (int i = const_port->count; i > 0; i--) {
+        npin_t *pin = const_port->pins[i - 1];
+        /* starting from the end, stop at the first pin on the one net and prune every pin before it */
+        if (!strcmp(pin->net->name, netlist->one_net->name)) {
+            idx = i;
+            break;
+        } else {
+            /* detach from the old mult node and free pin */
+            delete_npin(pin);
+        }
+    }
+    /* initializing new const port with the pins that were kept */
+    for (int i = 0; i < idx; i++) {
+        npin_t *pin = const_port->pins[i];
+        add_pin_to_signal_list(new_const_port, pin);
+    }
+
+    /* keep the original port order: the constant port stays first only if it was first */
+    signal_list_t *first_port = (mult_port_stat == mult_port_stat_e::MULTIPLIER_CONSTANT) ? new_const_port : var_port;
+    signal_list_t *second_port = (mult_port_stat == mult_port_stat_e::MULTIPLIER_CONSTANT) ? var_port : new_const_port;
+    /* creating new mult node */
+    int offset = 0;
+    nnode_t *new_node = make_2port_gate(node->type, first_port->count, second_port->count, node->num_output_pins, node, traverse_mark_number);
+    /* copy attributes */
+    if (mult_port_stat == mult_port_stat_e::MULTIPLIER_CONSTANT) {
+        new_node->attributes->port_a_signed = const_signedness;
+        new_node->attributes->port_b_signed = var_signedness;
+    } else {
+        new_node->attributes->port_a_signed = var_signedness;
+        new_node->attributes->port_b_signed = const_signedness;
+    }
+    /* adding first port */
+    for (int i = 0; i < first_port->count; i++) {
+        remap_pin_to_new_node(first_port->pins[i], new_node, offset + i);
+    }
+    /* the second port starts right after the first one */
+    offset += first_port->count;
+    /* adding second port */
+    for (int i = 0; i < second_port->count; i++) {
+        remap_pin_to_new_node(second_port->pins[i], new_node, offset + i);
+    }
+    /* remap output ports */
+    for (int i = 0; i < node->num_output_pins; i++) {
+        remap_pin_to_new_node(node->output_pins[i], new_node, i);
+    }
+
+    // CLEAN UP
+    free_signal_list(const_port);
+    free_signal_list(var_port);
+    free_signal_list(new_const_port);
+    free_nnode(node);
+    node = NULL;
+
+    return (new_node);
+}
+
+/**
+ * -------------------------------------------------------------------------
+ * (function: check_multiplier_port_size)
+ *
+ * If output size is less than the sum of input sizes,
+ * we need to expand output pins with pad pins. The existing
+ * output pins keep their positions; each pad pin gets a name
+ * carrying its own pin position and drives a newly allocated net.
+ * Nothing is done when no hard multipliers exist.
+ *
+ * @param node pointer to the multiplication node
+ * -----------------------------------------------------------------------
+ */
+void check_multiplier_port_size(nnode_t *node)
+{
+    /* Can only perform the optimisation if hard multipliers exist! */
+    if (hard_multipliers == NULL)
+        return;
+
+    const int sizeout = node->num_output_pins;
+    const int limit = node->input_port_sizes[0] + node->input_port_sizes[1];
+
+    /* the output port is already wide enough */
+    if (sizeout >= limit)
+        return;
+
+    // Keep record of old output pins pointer for cleaning up later
+    npin_t **old_output_pins = node->output_pins;
+
+    // Set the limit value as the number of output pins. The array comes from
+    // the vtr allocator so that it matches the vtr::free used on pin arrays
+    node->num_output_pins = limit;
+    node->output_port_sizes[0] = limit;
+    node->output_pins = (npin_t **)vtr::calloc(limit, sizeof(npin_t *));
+
+    // Move the existing output pins to the same positions of the new array
+    for (int i = 0; i < sizeout; i++) {
+        node->output_pins[i] = old_output_pins[i];
+    }
+
+    // Fill the extra spots with pad pins
+    for (int i = sizeout; i < limit; i++) {
+        npin_t *new_pin = allocate_npin();
+        // the pin position makes the pad name unique within the node
+        new_pin->name = append_string("", "%s~dummy_output~%d", node->name, i);
+        nnet_t *new_net = allocate_nnet();
+
+        // hook the output pin into the node
+        add_output_pin_to_node(node, new_pin, i);
+        // hook up the new pin as the driver of the new net
+        add_driver_pin_to_net(new_net, new_pin);
+    }
+
+    // CLEAN UP
+    vtr::free(old_output_pins);
+}
+/*-------------------------------------------------------------------------
+ * (function: clean_multipliers)
+ *
+ * Empty the list structure of multipliers used during optimization,
+ *	deleting one entry after the other until nothing is left
+ *-----------------------------------------------------------------------*/
+void clean_multipliers()
+{
+    /* the value returned by each deletion becomes the new list */
+    for (; mult_list != NULL; mult_list = delete_in_vptr_list(mult_list)) {
+    }
+}
+
+/**
+ * -------------------------------------------------------------------------
+ * (function: cleanup_mult_old_node)
+ *
+ * @brief <clean up nodeo, a high level MULT node>
+ * In the split_soft_multplier function, nodeo is split into small multipliers.
+ * Because of the complexity of the input pin connections, the input pins have
+ * not been remapped to the new nodes; they were just copied and added to the
+ * new nodes. This function detaches the input pins from nodeo. Moreover, it
+ * connects the net of each still attached output signal to the GND node,
+ * detaches the pin from nodeo and frees the output pin to avoid a memory leak.
+ * Finally nodeo itself is freed.
+ *
+ * @param nodeo representing the old multiplier node
+ * @param netlist representing the current netlist
+ *-----------------------------------------------------------------------*/
+static void cleanup_mult_old_node(nnode_t *nodeo, netlist_t *netlist)
+{
+    /* Disconnecting input pins from the old node side */
+    /* (the pins themselves are not freed here, only the old node's pointers are cleared) */
+    for (int i = 0; i < nodeo->num_input_pins; i++) {
+        nodeo->input_pins[i] = NULL;
+    }
+
+    /* connecting the extra output pins to the gnd node */
+    for (int i = 0; i < nodeo->num_output_pins; i++) {
+        npin_t *output_pin = nodeo->output_pins[i];
+
+        /* only pins that are still present and still attached to a node are handled */
+        if (output_pin && output_pin->node) {
+            /* for now we just pass the signals directly through */
+            npin_t *zero_pin = get_zero_pin(netlist);
+            /* position of the zero pin among the fanout pins of its net */
+            int idx_2_buffer = zero_pin->pin_net_idx;
+
+            // Dont eliminate the buffer if there are multiple drivers or the AST included it
+            if (output_pin->net->num_driver_pins <= 1) {
+                /* join all fanouts of the output net with the input pins net */
+                join_nets(zero_pin->net, output_pin->net);
+
+                /* erase the pointer to this buffer */
+                zero_pin->net->fanout_pins[idx_2_buffer] = NULL;
+            }
+
+            /* NOTE(review): with more than one driver the fanout slot above is not cleared
+             * although zero_pin is freed right here - confirm this cannot leave a stale pointer */
+            free_npin(zero_pin);
+            free_npin(output_pin);
+
+            /* Disconnecting output pins from the old node side */
+            nodeo->output_pins[i] = NULL;
+        }
+    }
+
+    // CLEAN UP
+    free_nnode(nodeo);
+}
+
+/* Free every t_multiplier record chained on hard_multipliers->instances and reset that chain to NULL */
+void free_multipliers()
+{
+    /* nothing to release without a model or without any instance record */
+    if (hard_multipliers == NULL || hard_multipliers->instances == NULL) {
+        return;
+    }
+
+    t_multiplier *following = NULL;
+    for (t_multiplier *current = (t_multiplier *)hard_multipliers->instances; current != NULL; current = following) {
+        /* remember the successor before the record goes away */
+        following = current->next;
+        vtr::free(current);
+    }
+
+    hard_multipliers->instances = NULL;
+}
diff --git a/parmys-plugin/core/multiplier.h b/parmys-plugin/core/multiplier.h
new file mode 100644
index 000000000..5f9fd4e16
--- /dev/null
+++ b/parmys-plugin/core/multiplier.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _MULTIPLIER_H_
+#define _MULTIPLIER_H_
+
+#include "odin_types.h"
+#include "read_xml_arch_file.h"
+
+/* One record of a singly linked list of multiplier sizes.
+ * multiplier.cc treats hard_multipliers->instances as a chain of these records. */
+struct t_multiplier {
+    int size_a;                // presumably the width of input port a - TODO confirm
+    int size_b;                // presumably the width of input port b - TODO confirm
+    int size_out;              // presumably the width of the output port - TODO confirm
+    struct t_multiplier *next; // following record of the list, NULL at the end
+};
+
+/* Tells which input ports of a multiplication node are driven by constants only */
+enum class mult_port_stat_e {
+    NOT_CONSTANT,         // neither of ports are constant
+    MULTIPLIER_CONSTANT,  // first input port is constant
+    MULTIPICAND_CONSTANT, // second input port is constant
+    CONSTANT,             // both input ports are constant
+    mult_port_stat_END    // end marker, not a real port status
+};
+
+/* hard multiplier model; NULL when no hard multipliers exist */
+extern t_model *hard_multipliers;
+/* list of multipliers kept during optimization, emptied by clean_multipliers() */
+extern vtr::t_linked_vptr *mult_list;
+extern int min_mult;
+
+extern void init_mult_distribution();
+extern void report_mult_distribution();
+extern void declare_hard_multiplier(nnode_t *node);
+extern void instantiate_hard_multiplier(nnode_t *node, short mark, netlist_t *netlist);
+extern void instantiate_simple_soft_multiplier(nnode_t *node, short mark, netlist_t *netlist);
+extern void connect_constant_mult_outputs(nnode_t *node, signal_list_t *output_signal_list);
+extern void find_hard_multipliers();
+extern void add_the_blackbox_for_mults_yosys(Yosys::Design *design);
+extern void define_mult_function_yosys(nnode_t *node, Yosys::Module *module, Yosys::Design *design);
+extern void split_multiplier(nnode_t *node, int a0, int b0, int a1, int b1, netlist_t *netlist);
+extern void iterate_multipliers(netlist_t *netlist);
+/* handles a multiplication with a constant port; returns true if the node had one */
+extern bool check_constant_multipication(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+/* pads the output port up to the sum of the input port sizes when hard multipliers exist */
+extern void check_multiplier_port_size(nnode_t *node);
+/* deletes all entries of mult_list */
+extern void clean_multipliers();
+/* frees the instance records chained on hard_multipliers */
+extern void free_multipliers();
+
+#endif // _MULTIPLIER_H_
diff --git a/parmys-plugin/core/subtractor.cc b/parmys-plugin/core/subtractor.cc
new file mode 100644
index 000000000..03c259f95
--- /dev/null
+++ b/parmys-plugin/core/subtractor.cc
@@ -0,0 +1,884 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "subtractor.h"
+#include "adder.h"
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "vtr_memory.h"
+
+using vtr::t_linked_vptr;
+
+USING_YOSYS_NAMESPACE
+
+/* File-level state of the subtractor handling */
+t_linked_vptr *sub_list = NULL; // NOTE(review): presumably the subtractor nodes collected for processing - confirm
+/* chain_information_t records; split_adder_for_sub inserts one per call */
+t_linked_vptr *sub_chain_list = NULL;
+int subchaintotal = 0; // NOTE(review): presumably a running count of sub chains - confirm
+int *sub = NULL;       // NOTE(review): purpose not visible near these declarations - confirm
+
+/* Forward declarations of helpers used before their definitions */
+void init_split_adder_for_sub(nnode_t *node, nnode_t *ptr, int a, int sizea, int b, int sizeb, int cin, int cout, int index, int flag);
+static void cleanup_sub_old_node(nnode_t *nodeo, netlist_t *netlist);
+
+/* These values are collected during the unused logic removal sweep */
+extern long subtractor_chain_count;
+extern long longest_subtractor_chain;
+extern long total_subtractors;
+
+/*---------------------------------------------------------------------------
+ * (function: report_sub_distribution)
+ *
+ * Log the number of subtractor chains, the length of the longest chain
+ * and the total number of hard block subs. Nothing is logged when no
+ * hard adders exist.
+ *-------------------------------------------------------------------------*/
+void report_sub_distribution()
+{
+    if (!hard_adders) {
+        return;
+    }
+
+    /* chain overview */
+    log("\nHard MINUS Distribution\n");
+    log("============================\n");
+    log("\n");
+    log("\nTotal # of chains = %ld\n", subtractor_chain_count);
+
+    /* chain details */
+    log("\nHard sub chain Details\n");
+    log("============================\n");
+
+    log("\n");
+    log("\nThe Number of Hard Block subs in the Longest Chain: %ld\n", longest_subtractor_chain);
+
+    log("\n");
+    log("\nThe Total Number of Hard Block subs: %ld\n", total_subtractors);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: declare_hard_adder_for_sub)
+ *
+ * Make sure the hard adder model holds an instance record whose a, b and
+ * sumout widths match the given subtraction node; a new record is put at
+ * the front of the instance list when no existing one matches.
+ * Without a hard adder model only a warning is issued.
+ *-------------------------------------------------------------------------*/
+void declare_hard_adder_for_sub(nnode_t *node)
+{
+    if (hard_adders == NULL) {
+        warning_message(NETLIST, node->loc, "%s\n", "Instantiating Substraction where hard adders do not exist");
+        /* there is no model to look at or to add an instance to */
+        return;
+    }
+
+    const int width_a = node->input_port_sizes[0];
+    const int width_b = node->input_port_sizes[1];
+    /* output port 1 is the sumout port */
+    const int width_sumout = node->output_port_sizes[1];
+
+    /* See if this size instance of adder exists?*/
+    for (t_adder *tmp = (t_adder *)hard_adders->instances; tmp != NULL; tmp = tmp->next) {
+        if ((tmp->size_a == width_a) && (tmp->size_b == width_b) && (tmp->size_sumout == width_sumout))
+            return;
+    }
+
+    /* Does not exist - must create an instance*/
+    t_adder *created = (t_adder *)vtr::malloc(sizeof(t_adder));
+    created->size_a = width_a;
+    created->size_b = width_b;
+    created->size_cin = 1;
+    created->size_cout = 1;
+    created->size_sumout = width_sumout;
+    /* the new record becomes the head of the instance list */
+    created->next = (t_adder *)hard_adders->instances;
+    hard_adders->instances = created;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: instantiate_hard_adder_subtraction )
+ *
+ * Declare the hard adder instance needed by the node, name every output
+ * pin that has no name yet as "<node name>[<pin index>]" and mark the
+ * node as visited with the given mark. The netlist argument is unused.
+ *-------------------------------------------------------------------------*/
+void instantiate_hard_adder_subtraction(nnode_t *node, short mark, netlist_t * /*netlist*/)
+{
+    declare_hard_adder_for_sub(node);
+
+    /* Give names to the output pins */
+    for (int i = 0; i < node->num_output_pins; i++) {
+        npin_t *pin = node->output_pins[i];
+        if (pin->name == NULL) {
+            /* 20 extra chars hold the brackets, any int pin index and the terminator */
+            const size_t len = strlen(node->name) + 20;
+            pin->name = (char *)vtr::malloc(len);
+            odin_sprintf(pin->name, "%s[%d]", node->name, pin->pin_node_idx);
+        }
+    }
+
+    node->traverse_visited = mark;
+}
+
+/*-----------------------------------------------------------------------
+ * (function: init_split_adder_for_sub)
+ *	Create a carry chain adder when spliting a subtraction. Port a inputs
+ *	are taken over from the original node where they exist, while the
+ *	port b, carry-in and all output pins are set to NULL for later connecting
+ *
+ *	node:         original subtraction node the port a pins are taken from
+ *	ptr:          already allocated node that is set up as one adder of the chain
+ *	a, b:         widths of port a / port b of the original node
+ *	sizea, sizeb: widths of port a / port b of the adder to set up
+ *	cin, cout:    widths of the carry-in / carry-out ports
+ *	index:        position of this adder in the chain (0 is the first one)
+ *	flag:         0 sets up an adder of exactly sizea and sizeb bits; otherwise
+ *	              the given sizes are used, with a size of 0 replaced by 1
+ *---------------------------------------------------------------------*/
+void init_split_adder_for_sub(nnode_t *node, nnode_t *ptr, int a, int sizea, int b, int sizeb, int cin, int cout, int index, int flag)
+{
+    /* 0: port a is completely fed by the node, 1: no bit of a is left, 2: only aa bits of a are left */
+    int flaga = 0;
+    int current_sizea, current_sizeb;
+    int aa = 0;
+
+    // if the input of the first cin is generated by a dummy adder added
+    // to the start of the chain, then an offset is needed to compensate
+    // for that in various positions in the code, otherwise the offset is 0
+    const int offset = (configuration.adder_cin_global) ? 0 : 1;
+
+    /* Copy properties from original node */
+    ptr->type = node->type;
+    ptr->related_ast_node = node->related_ast_node;
+    ptr->traverse_visited = node->traverse_visited;
+    ptr->node_data = NULL;
+
+    /* decide the current size of input a and b */
+    if (flag == 0) {
+        /* bits of a (and b) not yet covered by the adders in front of this one */
+        current_sizea = (a + offset) - sizea * index;
+        /* the value for b is superseded below, the adder always gets the full sizeb */
+        current_sizeb = (b + offset) - sizeb * index;
+        if (current_sizea >= sizea)
+            current_sizea = sizea;
+        else if (current_sizea <= 0) {
+            current_sizea = sizea;
+            flaga = 1;
+        } else {
+            aa = current_sizea;
+            current_sizea = sizea;
+            flaga = 2;
+        }
+        current_sizeb = sizeb;
+    } else {
+        current_sizea = (sizea != 0) ? sizea : 1;
+        current_sizeb = (sizeb != 0) ? sizeb : 1;
+    }
+
+    /* Set new port sizes and parameters */
+    ptr->num_input_port_sizes = 3;
+    ptr->input_port_sizes = (int *)vtr::malloc(3 * sizeof(int));
+    ptr->input_port_sizes[0] = current_sizea;
+    ptr->input_port_sizes[1] = current_sizeb;
+    ptr->input_port_sizes[2] = cin;
+    ptr->num_output_port_sizes = 2;
+    ptr->output_port_sizes = (int *)vtr::malloc(2 * sizeof(int));
+    ptr->output_port_sizes[0] = cout;
+
+    /* The size of output port sumout equals the maxim size of a and b  */
+    const int sumout_size = (current_sizea > current_sizeb) ? current_sizea : current_sizeb;
+    ptr->output_port_sizes[1] = sumout_size;
+
+    /* Set the number of pins and re-locate previous pin entries */
+    ptr->num_input_pins = current_sizea + current_sizeb + cin;
+    ptr->input_pins = (npin_t **)vtr::malloc(sizeof(void *) * (current_sizea + current_sizeb + cin));
+
+    /* hand input pin src of the original node over to position dst of the new adder */
+    auto take_over_pin = [node, ptr](int dst, int src) {
+        ptr->input_pins[dst] = node->input_pins[src];
+        ptr->input_pins[dst]->node = ptr;
+        ptr->input_pins[dst]->pin_node_idx = dst;
+    };
+
+    // the normal sub: if flaga or flagb = 1, the input pins should be empty.
+    // the unary sub: all input pins for a should be null, input pins for b should be connected to node
+    if (node->num_input_port_sizes == 1) {
+        for (int i = 0; i < current_sizea; i++)
+            ptr->input_pins[i] = NULL;
+    } else if ((flaga == 1) && (node->num_input_port_sizes == 2)) {
+        for (int i = 0; i < current_sizea; i++)
+            ptr->input_pins[i] = NULL;
+    } else if ((flaga == 2) && (node->num_input_port_sizes == 2)) {
+        /* only aa bits of a are left: connect those and leave the rest of port a empty */
+        if (index == 0) {
+            ptr->input_pins[0] = NULL;
+            if (sizea > 1) {
+                for (int i = 1; i < aa; i++)
+                    take_over_pin(i, i + index * sizea - 1);
+                for (int i = 0; i < (sizea - aa); i++)
+                    ptr->input_pins[i + aa] = NULL;
+            }
+        } else {
+            for (int i = 0; i < aa; i++)
+                take_over_pin(i, i + index * sizea - 1);
+            for (int i = 0; i < (sizea - aa); i++)
+                ptr->input_pins[i + aa] = NULL;
+        }
+    } else {
+        if (index == 0 && !configuration.adder_cin_global) {
+            if (flag == 0) {
+                /* the first position stays empty, the node pins follow shifted by one */
+                ptr->input_pins[0] = NULL;
+                if (sizea > 1) {
+                    for (int i = 1; i < sizea; i++)
+                        take_over_pin(i, i + index * sizea - 1);
+                }
+            } else {
+                for (int i = 0; i < current_sizea; i++)
+                    take_over_pin(i, i);
+            }
+        } else {
+            if (flag == 0) {
+                for (int i = 0; i < sizea; i++)
+                    take_over_pin(i, i + index * sizea - offset);
+            } else {
+                /* take the last current_sizea pins of port a of the node */
+                const int num = node->input_port_sizes[0];
+                for (int i = 0; i < current_sizea; i++)
+                    take_over_pin(i, i + num - current_sizea);
+            }
+        }
+    }
+
+    /* Port b starts right after the current_sizea pins of port a; its pins should be NULL */
+    for (int i = 0; i < current_sizeb; i++)
+        ptr->input_pins[current_sizea + i] = NULL;
+
+    /* Carry_in should be NULL*/
+    for (int i = 0; i < cin; i++) {
+        ptr->input_pins[i + current_sizea + current_sizeb] = NULL;
+    }
+
+    /* output pins: sumout and carry-out, all NULL for now */
+    const int output = sumout_size + cout;
+
+    ptr->num_output_pins = output;
+    ptr->output_pins = (npin_t **)vtr::malloc(sizeof(void *) * output);
+    for (int i = 0; i < output; i++)
+        ptr->output_pins[i] = NULL;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: split_adder_for_sub)
+ *
+ * This function works to split an adder into several smaller
+ *  adders to better "fit" with the available resources in a
+ *  targeted FPGA architecture.
+ *
+ * This function is at the lowest level since it simply receives
+ *  an adder and is told how to split it.
+ *
+ * Note that for some of the additions we need to perform sign extensions,
+ * but this should not be a problem since the sign extension is always
+ * extending NOT contracting.
+ *-----------------------------------------------------------------------*/
+
+void split_adder_for_sub(nnode_t *nodeo, int a, int b, int sizea, int sizeb, int cin, int cout, int count, netlist_t *netlist)
+{
+    nnode_t **node;     // the `count` new adder nodes that replace nodeo
+    nnode_t **not_node; // b NOT gates, one per bit of the operand that gets inverted
+    int num;
+    int max_num = 0;
+    int flag = 0, lefta = 0, leftb = 0; // flag becomes 1 when the last adder is built with the leftover widths lefta/leftb
+
+    // if the input of the first cin is generated by a dummy adder added
+    // to the start of the chain, then an offset is needed to compensate
+    // for that in various positions in the code, otherwise the offset is 0
+    const int offset = (configuration.adder_cin_global) ? 0 : 1;
+
+    /* Check for a legitimate split: a and b must match the port widths of nodeo (port 0 serves both when there are not two ports) */
+    if (nodeo->num_input_port_sizes == 2) {
+        oassert(nodeo->input_port_sizes[0] == a);
+        oassert(nodeo->input_port_sizes[1] == b);
+    } else {
+        oassert(nodeo->input_port_sizes[0] == a);
+        oassert(nodeo->input_port_sizes[0] == b);
+    }
+
+    node = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * (count));
+    not_node = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * (b));
+
+    for (int i = 0; i < b; i++) { // build a NOT gate on a copy of every input bit that has to be inverted
+        not_node[i] = allocate_nnode(nodeo->loc);
+        nnode_t *temp = not_node[i];
+        if (nodeo->num_input_port_sizes == 2) // two ports: the inverted bits are the ones that follow the a bits of port 0
+            not_node[i] = make_not_gate_with_input(copy_input_npin(nodeo->input_pins[a + i]), not_node[i], -1);
+        else
+            not_node[i] = make_not_gate_with_input(copy_input_npin(nodeo->input_pins[i]), not_node[i], -1);
+        free_nnode(temp); // NOTE(review): presumes make_not_gate_with_input returned a different node than temp - confirm
+    }
+
+    for (int i = 0; i < count; i++) {
+        node[i] = allocate_nnode(nodeo->loc);
+        node[i]->name = (char *)vtr::malloc(strlen(nodeo->name) + 20); // 20 spare bytes for the "-<index>" suffix
+        odin_sprintf(node[i]->name, "%s-%d", nodeo->name, i);
+        if (i == count - 1) { // only the last adder of the chain may differ in size
+            if (configuration.fixed_hard_adder == 1)
+                init_split_adder_for_sub(nodeo, node[i], a, sizea, b, sizeb, cin, cout, i, flag);
+            else {
+                if (count == 1) {
+                    lefta = a;
+                    leftb = b;
+                } else {
+                    lefta = (a + 1) % sizea;
+                    leftb = (b + 1) % sizeb;
+                }
+
+                max_num = (lefta >= leftb) ? lefta : leftb;
+                // if fixed_hard_adder = 0 and the leftover bits of a or b reach min_add (or nothing is left over), the last adder keeps the full size.
+                if (max_num >= min_add || lefta + leftb == 0)
+                    init_split_adder_for_sub(nodeo, node[i], a, sizea, b, sizeb, cin, cout, i, flag);
+                else {
+                    // Using soft logic to do the addition, no need to pad to the full size
+                    flag = 1;
+                    init_split_adder_for_sub(nodeo, node[i], a, lefta, b, leftb, cin, cout, i, flag);
+                }
+            }
+        } else
+            init_split_adder_for_sub(nodeo, node[i], a, sizea, b, sizeb, cin, cout, i, flag);
+
+        // store the processed hard adder node for optimization
+        processed_adder_list = insert_in_vptr_list(processed_adder_list, node[i]);
+    }
+
+    chain_information_t *adder_chain = allocate_chain_info();
+    // if flag = 1, the last adder uses soft logic, so the count of the chain should be one less
+    if (flag == 0)
+        adder_chain->count = count;
+    else
+        adder_chain->count = count - 1;
+    adder_chain->num_bits = a + b;
+    adder_chain->name = nodeo->name;
+    sub_chain_list = insert_in_vptr_list(sub_chain_list, adder_chain);
+
+    if (flag == 1 && count == 1) { // single leftover-sized adder: the inverted bits are wired right after its lefta bits
+        for (int i = 0; i < b; i++) {
+            /* If the input pin of not gate connects to gnd, replacing the input pin and the not gate with vcc;
+             * if the input pin of not gate connects to vcc, replacing the input pin and the not gate with gnd.*/
+            /* connecting untouched nets in the netlist creation to the pad node */
+            if (not_node[i]->input_pins[0]->net->num_driver_pins == 0) {
+                /* join untouched net with pad net */
+                join_nets(netlist->pad_net, not_node[i]->input_pins[0]->net);
+            }
+            oassert(not_node[i]->input_pins[0]->net->num_driver_pins == 1);
+            if (not_node[i]->input_pins[0]->net->driver_pins[0]->node->type == GND_NODE) {
+                connect_nodes(netlist->vcc_node, 0, node[0], (lefta + i));
+                remove_fanout_pins_from_net(not_node[i]->input_pins[0]->net, not_node[i]->input_pins[0], not_node[i]->input_pins[0]->pin_net_idx);
+                free_nnode(not_node[i]);
+            } else if (not_node[i]->input_pins[0]->net->driver_pins[0]->node->type == VCC_NODE) {
+                connect_nodes(netlist->gnd_node, 0, node[0], (lefta + i));
+                remove_fanout_pins_from_net(not_node[i]->input_pins[0]->net, not_node[i]->input_pins[0], not_node[i]->input_pins[0]->pin_net_idx);
+                free_nnode(not_node[i]);
+            } else
+                connect_nodes(not_node[i], 0, node[0], (lefta + i));
+        }
+    } else {
+        if (sizeb > 1) { // first adder: inverted bits are wired from input pin sizea + 1 onwards
+            if ((b + 1) < sizeb)
+                num = b;
+            else
+                num = sizeb - 1;
+            for (int i = 0; i < num; i++) {
+                /* If the input pin of not gate connects to gnd, replacing the input pin and the not gate with vcc;
+                 * if the input pin of not gate connects to vcc, replacing the input pin and the not gate with gnd.*/
+                /* connecting untouched nets in the netlist creation to the pad node */
+                if (not_node[i]->input_pins[0]->net->num_driver_pins == 0) {
+                    /* join untouched net with pad net */
+                    join_nets(netlist->pad_net, not_node[i]->input_pins[0]->net);
+                }
+                oassert(not_node[i]->input_pins[0]->net->num_driver_pins == 1);
+                if (not_node[i]->input_pins[0]->net->driver_pins[0]->node->type == GND_NODE) {
+                    connect_nodes(netlist->vcc_node, 0, node[0], (sizea + i + 1));
+                    remove_fanout_pins_from_net(not_node[i]->input_pins[0]->net, not_node[i]->input_pins[0], not_node[i]->input_pins[0]->pin_net_idx);
+                    free_nnode(not_node[i]);
+                } else if (not_node[i]->input_pins[0]->net->driver_pins[0]->node->type == VCC_NODE) {
+                    connect_nodes(netlist->gnd_node, 0, node[0], (sizea + i + 1));
+                    remove_fanout_pins_from_net(not_node[i]->input_pins[0]->net, not_node[i]->input_pins[0], not_node[i]->input_pins[0]->pin_net_idx);
+                    free_nnode(not_node[i]);
+                } else
+                    connect_nodes(not_node[i], 0, node[0], (sizea + i + 1));
+            }
+        }
+
+        for (int i = offset; i < count; i++) { // starts at adder `offset`: 0 with a global cin, 1 otherwise
+            num = (b + 1) - i * sizeb;
+            if (num > sizeb)
+                num = sizeb;
+
+            for (int j = 0; j < num; j++) {
+                if (i == count - 1 && flag == 1) { // last adder built with leftover widths: its second operand starts at pin lefta
+                    /* If the input pin of not gate connects to gnd, replacing the input pin and the not gate with vcc;
+                     * if the input pin of not gate connects to vcc, replacing the input pin and the not gate with gnd.*/
+                    /* connecting untouched nets in the netlist creation to the pad node */
+                    if (not_node[(i * sizeb + j - 1)]->input_pins[0]->net->num_driver_pins == 0) {
+                        join_nets(netlist->pad_net, not_node[(i * sizeb + j - 1)]->input_pins[0]->net);
+                    }
+                    oassert(not_node[(i * sizeb + j - 1)]->input_pins[0]->net->num_driver_pins == 1);
+                    if (not_node[(i * sizeb + j - 1)]->input_pins[0]->net->driver_pins[0]->node->type == GND_NODE) {
+                        connect_nodes(netlist->vcc_node, 0, node[i], (lefta + j));
+                        remove_fanout_pins_from_net(not_node[(i * sizeb + j - 1)]->input_pins[0]->net, not_node[(i * sizeb + j - 1)]->input_pins[0],
+                                                    not_node[(i * sizeb + j - 1)]->input_pins[0]->pin_net_idx);
+                        free_nnode(not_node[(i * sizeb + j - 1)]);
+                    } else if (not_node[(i * sizeb + j - 1)]->input_pins[0]->net->driver_pins[0]->node->type == VCC_NODE) {
+                        connect_nodes(netlist->gnd_node, 0, node[i], (lefta + j));
+                        remove_fanout_pins_from_net(not_node[(i * sizeb + j - 1)]->input_pins[0]->net, not_node[(i * sizeb + j - 1)]->input_pins[0],
+                                                    not_node[(i * sizeb + j - 1)]->input_pins[0]->pin_net_idx);
+                        free_nnode(not_node[(i * sizeb + j - 1)]);
+                    } else
+                        connect_nodes(not_node[(i * sizeb + j - 1)], 0, node[i], (lefta + j));
+                } else {
+                    /* If the input pin of not gate connects to gnd, replacing the input pin and the not gate with vcc;
+                     * if the input pin of not gate connects to vcc, replacing the input pin and the not gate with gnd.*/
+                    const int index = i * sizeb + j - offset;
+                    /* connecting untouched nets in the netlist creation to the pad node */
+                    if (not_node[index]->input_pins[0]->net->num_driver_pins == 0) {
+                        /* join untouched net with pad net */
+                        join_nets(netlist->pad_net, not_node[index]->input_pins[0]->net);
+                    }
+                    oassert(not_node[index]->input_pins[0]->net->num_driver_pins == 1);
+                    if (not_node[index]->input_pins[0]->net->driver_pins[0]->node->type == GND_NODE) {
+                        connect_nodes(netlist->vcc_node, 0, node[i], (sizea + j));
+                        remove_fanout_pins_from_net(not_node[index]->input_pins[0]->net, not_node[index]->input_pins[0],
+                                                    not_node[index]->input_pins[0]->pin_net_idx);
+                        free_nnode(not_node[index]);
+                    } else if (not_node[index]->input_pins[0]->net->driver_pins[0]->node->type == VCC_NODE) {
+                        connect_nodes(netlist->gnd_node, 0, node[i], (sizea + j));
+                        remove_fanout_pins_from_net(not_node[index]->input_pins[0]->net, not_node[index]->input_pins[0],
+                                                    not_node[index]->input_pins[0]->pin_net_idx);
+                        free_nnode(not_node[index]);
+                    } else
+                        connect_nodes(not_node[index], 0, node[i], (sizea + j));
+                }
+            }
+        }
+    }
+
+    if ((flag == 0 || count > 1) && !configuration.adder_cin_global) {
+        // connect the a[0] of first adder node to ground, and b[0] of first adder node to vcc
+        connect_nodes(netlist->gnd_node, 0, node[0], 0);
+        connect_nodes(netlist->vcc_node, 0, node[0], sizea);
+        // hang the first sumout
+        node[0]->output_pins[1] = allocate_npin();
+        node[0]->output_pins[1]->name = append_string("", "%s~dummy_output~%d~%d", node[0]->name, 0, 1);
+    }
+
+    // connect the first cin pin (the last input pin of node[0]) to vcc or unconn depending on configuration
+    if ((flag == 1 && count == 1) || configuration.adder_cin_global)
+        connect_nodes(netlist->vcc_node, 0, node[0], node[0]->num_input_pins - 1);
+    else
+        connect_nodes(netlist->pad_node, 0, node[0], node[0]->num_input_pins - 1);
+
+    // for normal subtraction: if any input pin besides the initial cin is NULL, it should connect to unconn
+    // for unary subtraction: the first a input pins are connected to gnd. The others are the same as in normal subtraction
+    for (int i = 0; i < count; i++) {
+        num = node[i]->num_input_pins;
+        for (int j = 0; j < num - 1; j++) { // num - 1: the cin pin (last input) is handled separately
+            if (node[i]->input_pins[j] == NULL) {
+                if (nodeo->num_input_port_sizes != 3 && i * sizea + j < a)
+                    connect_nodes(netlist->gnd_node, 0, node[i], j);
+                else
+                    connect_nodes(netlist->pad_node, 0, node[i], j);
+            }
+        }
+    }
+
+    // connect cout to next node's cin
+    for (int i = 1; i < count; i++)
+        connect_nodes(node[i - 1], 0, node[i], (node[i]->num_input_pins - 1));
+
+    if (flag == 1 && count == 1) { // single leftover-sized adder: results are mapped from output pin 1 onwards
+        for (int j = 0; j < node[0]->num_output_pins - 1; j++) {
+            if (j < nodeo->num_output_pins)
+                remap_pin_to_new_node(nodeo->output_pins[j], node[0], j + 1);
+            else {
+                node[0]->output_pins[j + 1] = allocate_npin();
+                // Pad outputs with a unique and descriptive name to avoid collisions.
+                node[0]->output_pins[j + 1]->name = append_string("", "%s~dummy_output~%d~%d", node[0]->name, 0, j + 1);
+            }
+        }
+    } else { // otherwise the results of the first adder are mapped from output pin 2 onwards
+        for (int j = 0; j < node[0]->num_output_pins - 2; j++) {
+            if (j < nodeo->num_output_pins)
+                remap_pin_to_new_node(nodeo->output_pins[j], node[0], j + 2);
+            else {
+                node[0]->output_pins[j + 2] = allocate_npin();
+                // Pad outputs with a unique and descriptive name to avoid collisions.
+                node[0]->output_pins[j + 2]->name = append_string("", "%s~dummy_output~%d~%d", node[0]->name, 0, j + 2);
+            }
+        }
+    }
+
+    if (count > 1 || configuration.adder_cin_global) {
+        // remap the output pins of each adder to nodeo
+        for (int i = offset; i < count; i++) {
+            for (int j = 0; j < node[i]->num_output_pins - 1; j++) {
+                if ((i * sizea + j - offset) < nodeo->num_output_pins)
+                    remap_pin_to_new_node(nodeo->output_pins[i * sizea + j - offset], node[i], j + 1);
+                else {
+                    node[i]->output_pins[j + 1] = allocate_npin();
+                    // Pad outputs with a unique and descriptive name to avoid collisions.
+                    node[i]->output_pins[j + 1]->name = append_string("", "%s~dummy_output~%d~%d", node[i]->name, i, j + 2);
+                }
+            }
+        }
+    }
+    node[count - 1]->output_pins[0] = allocate_npin(); // output pin 0 of the last adder gets a dummy pin
+    // Pad outputs with a unique and descriptive name to avoid collisions.
+    node[count - 1]->output_pins[0]->name = append_string("", "%s~dummy_output~%d~%d", node[(count - 1)]->name, (count - 1), 0);
+    // connect_nodes(node[count - 1], (node[(count - 1)]->num_output_pins - 1), netlist->gnd_node, 0);
+    // }
+
+    /* Freeing the old node! */
+    cleanup_sub_old_node(nodeo, netlist);
+
+    vtr::free(node); // only the pointer arrays are released here, not the nodes they point to
+    vtr::free(not_node);
+    return;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: iterate_adders_for_sub)
+ *
+ * Drains sub_list: every MINUS node whose wider operand reaches
+ *	min_threshold_adder is handed to split_adder_for_sub() together
+ *	with the number of hard adders needed to cover it; narrower nodes
+ *	are only recorded in processed_adder_list. Nothing is done when
+ *	no hard adder is available.
+ *-----------------------------------------------------------------------*/
+void iterate_adders_for_sub(netlist_t *netlist)
+{
+    /* Can only perform the optimisation if hard adders exist! */
+    if (hard_adders == NULL)
+        return;
+
+    // offset is 1 unless configuration.adder_cin_global is set
+    const int offset = (configuration.adder_cin_global) ? 0 : 1;
+
+    // In hard block adder, the summand and addend are same size.
+    const int sizecin = hard_adders->inputs->size;
+    const int sizeb = hard_adders->inputs->next->size;
+    const int sizea = hard_adders->inputs->next->size;
+
+    oassert(sizecin == 1);
+
+    // consume sub_list from the head until it is empty
+    while (sub_list != NULL) {
+        nnode_t *node = (nnode_t *)sub_list->data_vptr;
+        sub_list = delete_in_vptr_list(sub_list);
+
+        oassert(node != NULL);
+        oassert(node->type == MINUS);
+
+        // operand widths: with anything other than two input ports,
+        // both widths are taken from port 0
+        const int a = node->input_port_sizes[0];
+        int b = node->input_port_sizes[0];
+        if (node->num_input_port_sizes == 2)
+            b = node->input_port_sizes[1];
+
+        // the wider operand decides whether the node is split
+        // onto hard adders at all
+        const int num = (a >= b) ? a : b;
+
+        if (num < min_threshold_adder) {
+            // below the threshold the node is left unsplit:
+            // Store the node into processed_adder_list if the threshold is bigger than num
+            processed_adder_list = insert_in_vptr_list(processed_adder_list, node);
+            continue;
+        }
+
+        // how many subtractors base on a can split
+        // (the exact-fit case depends on offset)
+        int counta = (a + 1) / sizea + 1;
+        if ((a + 1) % sizea == 0)
+            counta = (a + offset) / sizea;
+
+        // how many subtractors base on b can split
+        int countb = (b + 1) / sizeb + 1;
+        if ((b + 1) % sizeb == 0)
+            countb = (b + offset) / sizeb;
+
+        // how many subtractors need to be split
+        const int count = (counta >= countb) ? counta : countb;
+        subchaintotal++;
+
+        // cin and cout are both passed as 1
+        split_adder_for_sub(node, a, b, sizea, sizeb, 1, 1, count, netlist);
+    }
+
+    return;
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: instantiate_sub_w_borrow_block )
+ *
+ * @brief soft logic implementation of single bit subtraction
+ * with borrow_in and borrow_out
+ *
+ * @param node pointing to the single bit subtraction node (freed on return)
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_sub_w_borrow_block(nnode_t *node, short traverse_mark_number, netlist_t *netlist)
+{
+    /* validate input port sizes */
+    oassert(node->input_port_sizes[0] == 1);
+    oassert(node->input_port_sizes[1] == 1);
+    /* validate output port sizes */
+    oassert(node->num_output_port_sizes == 2);
+    oassert(node->output_port_sizes[0] == 1);
+    oassert(node->output_port_sizes[1] == 1);
+
+    /*                                                                                                         *
+     * <SUB INTERNAL DESIGN>                                                                                   *
+     *                                                                                                         *
+     *       IN1 ----- \\‾‾``                                                                                  *
+     *           |      ||   ``                                                                                *
+     *           |      ||   '' --------------------------------------------------  \\‾‾``                     *
+     *           |      ||   ,,                |                                     ||   ``                   *
+     * IN2 ------0----  //__,,                 |                                     ||   '' ----- DIFF        *
+     *     |     |   (first_xor)               |                                     ||   ,,                   *
+     *     |     |                             |                       -----------  //__,,                     *
+     *     |     |                             |                       |          (second_xor)                 *
+     *     |     |                             |                       |                                       *
+     *     |     |                             |                       |                                       *
+     *     |     |                             |                       |                                       *
+     *     |     |      BIN -------------------0-----------------------|                                       *
+     *     |     |                             |   |                                                           *
+     *     |     |                             |   |                                                           *
+     *     |     |                             |   |     ___                                                   *
+     *     |     |                             |   |____|   ⎞                                                  *
+     *     |     |                             |        |    ⎞                                                 *
+     *     |     |                             |        |     )----------------                                *
+     *     |     |                             |__|\____|    ⎠                |            ____                *
+     *     |     |         ___                    |/    |___⎠                 |-----------⎞    \               *
+     *     |     |__|\____|   ⎞                     (first_xor_not)                        ⎞    ⎞              *
+     *     |        |/    |    ⎞                                                            |    )--- BOUT     *
+     *     |     (IN_not) |     )--------------------------------------------------------- ⎠    ⎠              *
+     *     |______________|    ⎠                                                          ⎠____/               *
+     *                    |___⎠                                                         (first_or)             *
+     *                  (first_and)                                                                            *
+     *                                                                                                         *
+     */
+
+    /**
+     * SUB ports:
+     *
+     * IN1:  1 bit input_port[0]
+     * IN2:  1 bit input_port[1]
+     * BIN:  1 bit input_port[2] (optional, constant zero is used when absent)
+     *
+     * DIFF: 1 bit output_port[0] = IN1 ^ IN2 ^ BIN
+     * BOUT: 1 bit output_port[1] = (~IN1 & IN2) | (BIN & ~(IN1 ^ IN2))
+     */
+
+    npin_t *IN1 = node->input_pins[0];
+    npin_t *IN2 = node->input_pins[1];
+    npin_t *BIN = (node->num_input_port_sizes == 3) ? node->input_pins[2] : NULL;
+
+    npin_t *BOUT = (node->num_output_port_sizes == 2) ? node->output_pins[1] : NULL;
+    npin_t *DIFF = node->output_pins[0];
+
+    /*******************************************************************************
+     ********************************** DIFFERENCE *********************************
+     *******************************************************************************/
+    /* creating the first xor */
+    nnode_t *xor1 = make_2port_gate(LOGICAL_XOR, 1, 1, 1, node, traverse_mark_number);
+    /* remapping IN1 as the first input to XOR */
+    remap_pin_to_new_node(IN1, xor1, 0);
+    /* remapping IN2 as the second input to XOR */
+    remap_pin_to_new_node(IN2, xor1, 1);
+    /* create the first xor output pin */
+    signal_list_t *xor1_outs = make_output_pins_for_existing_node(xor1, 1);
+    npin_t *xor1_out = xor1_outs->pins[0]->net->fanout_pins[0];
+
+    /* creating the second xor */
+    nnode_t *xor2 = make_2port_gate(LOGICAL_XOR, 1, 1, 1, node, traverse_mark_number);
+    /* remapping xor1 output as the first input to XOR */
+    add_input_pin_to_node(xor2, xor1_out, 0);
+    /* remapping BIN as the second input to XOR */
+    if (BIN == NULL) {
+        add_input_pin_to_node(xor2, get_zero_pin(netlist), 1);
+    } else {
+        remap_pin_to_new_node(BIN, xor2, 1);
+    }
+    /* need to remap the DIFF as second xor output pin */
+    remap_pin_to_new_node(DIFF, xor2, 0);
+
+    /*******************************************************************************
+     ************************************* BORROW **********************************
+     *******************************************************************************/
+    /* creating not IN1 */
+    nnode_t *IN1_not = make_inverter(copy_input_npin(IN1), node, traverse_mark_number);
+    npin_t *IN1_not_out = IN1_not->output_pins[0]->net->fanout_pins[0];
+
+    /* creating the first and */
+    nnode_t *and1 = make_2port_gate(LOGICAL_AND, 1, 1, 1, node, traverse_mark_number);
+    /* the inverted IN1 is the first input of the first and */
+    add_input_pin_to_node(and1, IN1_not_out, 0);
+    /* IN2 itself already belongs to xor1, so the first and gets its own copy of the pin */
+    add_input_pin_to_node(and1, copy_input_npin(IN2), 1);
+    /* create the first and output pin */
+    signal_list_t *and1_outs = make_output_pins_for_existing_node(and1, 1);
+    npin_t *and1_out = and1_outs->pins[0]->net->fanout_pins[0];
+
+    /* creating not first_xor */
+    nnode_t *xor1_not = make_inverter(copy_input_npin(xor1_out), xor1, traverse_mark_number);
+    npin_t *xor1_not_out = xor1_not->output_pins[0]->net->fanout_pins[0];
+
+    /* creating the second_and */
+    nnode_t *and2 = make_2port_gate(LOGICAL_AND, 1, 1, 1, node, traverse_mark_number);
+    /* a copy of BIN (constant zero when there is no borrow-in port) is the first input of the second and */
+    add_input_pin_to_node(and2, (BIN == NULL) ? get_zero_pin(netlist) : copy_input_npin(BIN), 0);
+    /* the inverted first xor is the second input of the second and */
+    add_input_pin_to_node(and2, xor1_not_out, 1);
+    /* create the second_and output pin (on and2 itself, not on the inverter feeding it) */
+    signal_list_t *and2_outs = make_output_pins_for_existing_node(and2, 1);
+    npin_t *and2_out = and2_outs->pins[0]->net->fanout_pins[0];
+
+    /* creating the first_or: BOUT is the OR of both and gates */
+    nnode_t *or1 = make_2port_gate(LOGICAL_OR, 1, 1, 1, node, traverse_mark_number);
+    /* the second_and output is the first input of the OR */
+    add_input_pin_to_node(or1, and2_out, 0);
+    /* the first and output is the second input of the OR */
+    add_input_pin_to_node(or1, and1_out, 1);
+    if (BOUT == NULL) {
+        /* create the first_or output pin */
+        npin_t *or1_out1 = allocate_npin();
+        npin_t *or1_out2 = allocate_npin();
+        nnet_t *or1_net = allocate_nnet();
+        or1_net->name = make_full_ref_name(NULL, NULL, NULL, or1->name, 0);
+        /* hook the output pin into the node */
+        add_output_pin_to_node(or1, or1_out1, 0);
+        /* hook up new pin 1 into the new net */
+        add_driver_pin_to_net(or1_net, or1_out1);
+        /* hook up the new pin 2 to this new net */
+        add_fanout_pin_to_net(or1_net, or1_out2);
+    } else {
+        remap_pin_to_new_node(BOUT, or1, 0);
+    }
+
+    // CLEAN UP
+    free_signal_list(xor1_outs);
+    free_signal_list(and1_outs);
+    free_signal_list(and2_outs);
+    free_nnode(node);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: clean_adders_for_sub)
+ *
+ * Releases every entry still held in sub_list and in
+ *	processed_adder_list, leaving both lists NULL.
+ *-----------------------------------------------------------------------*/
+void clean_adders_for_sub()
+{
+    // pop entries one at a time until each list is exhausted
+    for (; sub_list != NULL; sub_list = delete_in_vptr_list(sub_list)) {
+    }
+    for (; processed_adder_list != NULL; processed_adder_list = delete_in_vptr_list(processed_adder_list)) {
+    }
+}
+
+/**
+ * -------------------------------------------------------------------------
+ * (function: cleanup_sub_old_node)
+ *
+ * @brief <dispose of nodeo, the high level MINUS node that was split>
+ * The input pins of nodeo are not remapped by split_adder_for_sub; the new
+ * nodes receive copies instead. This function therefore deletes the input
+ * pins that still point at nodeo and clears every input slot. Each output
+ * pin that is still attached to a node has its net joined into the net of
+ * a zero pin (unless that net has more than one driver), after which the
+ * pin is freed and its slot cleared. Finally nodeo itself is freed.
+ *
+ * @param nodeo representing the old subtraction node
+ * @param netlist representing the current netlist
+ *-----------------------------------------------------------------------*/
+static void cleanup_sub_old_node(nnode_t *nodeo, netlist_t *netlist)
+{
+    /* detach every input pin; pins still owned by nodeo are deleted first */
+    for (int in_idx = 0; in_idx < nodeo->num_input_pins; in_idx++) {
+        if (nodeo->input_pins[in_idx]->node == nodeo)
+            delete_npin(nodeo->input_pins[in_idx]);
+
+        nodeo->input_pins[in_idx] = NULL;
+    }
+
+    /* tie the nets of the leftover output pins to the zero net */
+    for (int out_idx = 0; out_idx < nodeo->num_output_pins; out_idx++) {
+        npin_t *old_out = nodeo->output_pins[out_idx];
+
+        /* empty slots and pins without a node need no work */
+        if (old_out == NULL || old_out->node == NULL)
+            continue;
+
+        npin_t *gnd_pin = get_zero_pin(netlist);
+        const int gnd_slot = gnd_pin->pin_net_idx;
+
+        // Dont eliminate the buffer if there are multiple drivers or the AST included it
+        if (old_out->net->num_driver_pins <= 1) {
+            /* join the output net into the net of the zero pin */
+            join_nets(gnd_pin->net, old_out->net);
+
+            /* drop that net's reference to the temporary zero pin */
+            gnd_pin->net->fanout_pins[gnd_slot] = NULL;
+        }
+
+        /* both the temporary zero pin and the old output pin are released */
+        free_npin(gnd_pin);
+        free_npin(old_out);
+
+        /* Disconnecting output pins from the old node side */
+        nodeo->output_pins[out_idx] = NULL;
+    }
+
+    free_nnode(nodeo);
+}
diff --git a/parmys-plugin/core/subtractor.h b/parmys-plugin/core/subtractor.h
new file mode 100644
index 000000000..281e659ca
--- /dev/null
+++ b/parmys-plugin/core/subtractor.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _SUBTRACTOR_H_
+#define _SUBTRACTOR_H_
+
+#include "adder.h"
+#include "read_xml_arch_file.h"
+
+extern vtr::t_linked_vptr *sub_list;       // subtraction (MINUS) nodes consumed by iterate_adders_for_sub()
+extern vtr::t_linked_vptr *sub_chain_list; // chain records appended by split_adder_for_sub()
+
+extern void report_sub_distribution();
+extern void declare_hard_adder_for_sub(nnode_t *node);
+extern void instantiate_hard_adder_subtraction(nnode_t *node, short mark, netlist_t *netlist);
+extern void split_adder_for_sub(nnode_t *node, int a, int b, int sizea, int sizeb, int cin, int cout, int count, netlist_t *netlist); // replaces node with `count` chained adders
+extern void iterate_adders_for_sub(netlist_t *netlist); // drains sub_list, splitting nodes that reach min_threshold_adder
+extern void instantiate_sub_w_borrow_block(nnode_t *node, short traverse_mark_number, netlist_t *netlist); // replaces node with a gate level 1-bit subtractor
+extern void clean_adders_for_sub(); // empties sub_list and processed_adder_list
+
+#endif // _SUBTRACTOR_H_
diff --git a/parmys-plugin/mapping/hard_soft_logic_mixer.cc b/parmys-plugin/mapping/hard_soft_logic_mixer.cc
new file mode 100644
index 000000000..3ac8571f9
--- /dev/null
+++ b/parmys-plugin/mapping/hard_soft_logic_mixer.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "hard_soft_logic_mixer.h"
+
+#include <stdint.h> // INT_MAX
+#include <vector>
+
+#include "multiplier.h" // instantiate_simple_soft_multiplier
+#include "odin_error.h" // error_message
+
+HardSoftLogicMixer::HardSoftLogicMixer()
+{
+    // one optimizer per operation type: MultsOpt for MULTIPLY, the generic MixingOpt for the rest
+    for (int op = 0; op < operation_list_END; ++op) {
+        if (op != MULTIPLY)
+            _opts[op] = new MixingOpt();
+        else
+            _opts[op] = new MultsOpt();
+    }
+}
+
+HardSoftLogicMixer::~HardSoftLogicMixer()
+{
+    // release each per-operation optimizer held in _opts
+    for (int op = 0; op < operation_list_END; ++op)
+        delete _opts[op];
+}
+void HardSoftLogicMixer::note_candidate_node(nnode_t *opNode) { this->_nodes_by_opt[opNode->type].push_back(opNode); } // queue opNode under its operation type
+
+bool HardSoftLogicMixer::hardenable(nnode_t *node) { return _opts[node->type]->hardenable(node); } // delegates to the optimizer of the node's type
+
+bool HardSoftLogicMixer::enabled(nnode_t *node) { return _opts[node->type]->enabled(); } // asks the optimizer of the node's type
+
+int HardSoftLogicMixer::hard_blocks_needed(operation_list opt) { return static_cast<int>(this->_nodes_by_opt[opt].size()); } // number of nodes noted for opt
+
+void HardSoftLogicMixer::partial_map_node(nnode_t *node, short traverse_number, netlist_t *netlist)
+{
+    this->_opts[node->type]->partial_map_node(node, traverse_number, netlist, this); // the mixer passes itself to the optimizer
+}
+
+void HardSoftLogicMixer::perform_optimizations(netlist_t *netlist)
+{
+    // only the MULTIPLY optimizer is run here, and only when it is enabled
+    if (!_opts[MULTIPLY]->enabled())
+        return;
+    _opts[MULTIPLY]->set_blocks_needed(this->hard_blocks_needed(MULTIPLY));
+    _opts[MULTIPLY]->assign_weights(netlist, _nodes_by_opt[MULTIPLY]);
+    _opts[MULTIPLY]->perform(netlist, _nodes_by_opt[MULTIPLY]);
+    _opts[MULTIPLY]->instantiate_soft_logic(netlist, _nodes_by_opt[MULTIPLY]);
+}
diff --git a/parmys-plugin/mapping/hard_soft_logic_mixer.h b/parmys-plugin/mapping/hard_soft_logic_mixer.h
new file mode 100644
index 000000000..86c58da0e
--- /dev/null
+++ b/parmys-plugin/mapping/hard_soft_logic_mixer.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _HARD_SOFT_LOGIC_MIXER_HPP_
+#define _HARD_SOFT_LOGIC_MIXER_HPP_
+
+#include "mixing_optimization.h"
+#include "odin_types.h"
+
+class HardSoftLogicMixer
+{
+  public:
+    HardSoftLogicMixer();
+    ~HardSoftLogicMixer();
+    /*----------------------------------------------------------------------
+     * Function: hardenable. Returns whether the pass for the node's type
+     * considers the node a candidate for implementing in hard block
+     *---------------------------------------------------------------------
+     */
+    bool hardenable(nnode_t *node);
+
+    /*----------------------------------------------------------------------
+     * Function: enabled
+     * Queries if mixing optimization is enabled for this node's kind of hard blocks
+     *---------------------------------------------------------------------
+     */
+    bool enabled(nnode_t *node);
+
+    /*----------------------------------------------------------------------
+     * Function: perform_optimizations
+     * For all nodes that were noted as candidates to be implemented on the
+     * hard blocks, launches the corresponding procedure of choosing which of
+     * them are hardened; currently only the MULTIPLY pass is run
+     * Parameters: netlist_t * : pointer to netlist
+     *---------------------------------------------------------------------
+     */
+    void perform_optimizations(netlist_t *netlist);
+
+    /*----------------------------------------------------------------------
+     * Function: partial_map_node
+     * High-level call to provide support for partial mapping layer
+     * Parameters:
+     *      nnode_t * : node to map; short : traverse number (mark value)
+     *      netlist_t * : pointer to netlist
+     *---------------------------------------------------------------------
+     */
+    void partial_map_node(nnode_t *node, short traverse_number, netlist_t *);
+
+    /*----------------------------------------------------------------------
+     * Function: note_candidate_node
+     * Records the node as a candidate for hard-block implementation by
+     * appending it to the candidate list of its operation type; the
+     * selection itself is made later, in perform_optimizations
+     * Parameters:
+     *      nnode_t * : pointer to candidate node
+     *---------------------------------------------------------------------
+     */
+    void note_candidate_node(nnode_t *node);
+
+    // One optimization pass per operation kind, indexed by operation_list value
+    MixingOpt *_opts[operation_list_END];
+
+  private:
+    /*----------------------------------------------------------------------
+     * Function: hard_blocks_needed
+     * Returns the number of candidate nodes noted so far for a specific
+     * optimization kind
+     *---------------------------------------------------------------------
+     */
+    int hard_blocks_needed(operation_list);
+
+    // This array is composed of vectors, that store nodes that
+    // are potential candidates for performing mixing optimization
+    std::vector<nnode_t *> _nodes_by_opt[operation_list_END];
+};
+
+#endif
diff --git a/parmys-plugin/mapping/mixing_optimization.cc b/parmys-plugin/mapping/mixing_optimization.cc
new file mode 100644
index 000000000..5ddd8adbb
--- /dev/null
+++ b/parmys-plugin/mapping/mixing_optimization.cc
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "mixing_optimization.h"
+
+#include <stdint.h> // INT_MAX
+#include <vector>
+
+#include "adder.h"                 // hard_adders
+#include "hard_soft_logic_mixer.h" // HardSoftLogicMixer
+#include "multiplier.h"            // instantiate_simple_soft_multiplier
+#include "netlist_statistic.h"     // mixing_optimization_stats
+#include "odin_error.h"            // error_message
+
+void MixingOpt::scale_counts()
+{
+    if (this->_blocks_count < 0 || this->_blocks_count == INT_MAX || !(this->_ratio >= 0.0 && this->_ratio <= 1.0)) {
+        error_message(NETLIST, unknown_location, "The parameters for optimization kind:%i are configured incorrectly : count %i, ratio %f\n",
+                      this->_kind, this->_blocks_count, this->_ratio);
+        exit(EXIT_FAILURE); // negated range test also rejects a NaN ratio; an error must not exit with a success status
+    }
+    this->_blocks_count = this->_blocks_count * this->_ratio; // fractional part is dropped by the conversion back to int
+}
+
+void MixingOpt::assign_weights(netlist_t * /*netlist*/, std::vector<nnode_t *> /*nodes*/)
+{
+    // the base pass has no weighting scheme: report the error and stop with a failure status
+    error_message(NETLIST, unknown_location,
+                  "Assign_weights mixing optimization was called for optimization without specification provided, for kind  %i\n", this->_kind);
+    exit(EXIT_FAILURE);
+}
+
+void MixingOpt::perform(netlist_t *, std::vector<nnode_t *> &)
+{
+    error_message(NETLIST, unknown_location, "Performing mixing optimization was called for optimization without method provided, for kind  %i\n",
+                  this->_kind);
+    exit(EXIT_FAILURE); // the base pass has nothing to perform: this is an error, so the status is non-zero
+}
+
+MultsOpt::MultsOpt(int _exact) : MixingOpt(1.0, MULTIPLY)
+{
+    _enabled = true;
+    _blocks_count = _exact; // exact hard-multiplier budget requested by the caller; the ratio was fixed to 1.0 above
+}
+
+MultsOpt::MultsOpt(float ratio) : MixingOpt(ratio, MULTIPLY)
+{
+    // written as a negated range test so that a NaN ratio is rejected as well
+    if (!(ratio >= 0.0 && ratio <= 1.0)) {
+        error_message(NETLIST, unknown_location, "Miltipliers mixing optimization is started with wrong ratio %f\n", ratio);
+        exit(EXIT_FAILURE);
+    }
+    // Explicitly set all hard block multipliers to max
+    this->_blocks_count = INT_MAX;
+    this->_enabled = true;
+}
+
+bool MultsOpt::hardenable(nnode_t *node)
+{
+    // the wider of the two operands must exceed min_mult, and hard multipliers must be available
+    return (std::max<int>(node->input_port_sizes[0], node->input_port_sizes[1]) > min_mult) && hard_multipliers;
+}
+
+void MultsOpt::assign_weights(netlist_t *netlist, std::vector<nnode_t *> nodes)
+{
+    // run the netlist statistics on every noted candidate, in list order
+    for (nnode_t *candidate : nodes) {
+        mixing_optimization_stats(candidate, netlist);
+    }
+}
+
+void MultsOpt::perform(netlist_t *netlist, std::vector<nnode_t *> &weighted_nodes)
+{
+    const size_t candidates = weighted_nodes.size();
+
+    // Greedy selection: every round hardens the heaviest candidate that is still
+    // available, for at most _blocks_count rounds.
+    for (int round = 0; round < this->_blocks_count; round++) {
+        int best_weight = -1;
+        int best = -1;
+        for (size_t j = 0; j < candidates; j++) {
+            nnode_t *candidate = weighted_nodes[j];
+            // must beat the current best AND satisfy the minimal "hardenable" multiplier width
+            if (best_weight < candidate->weight && this->hardenable(candidate)) {
+                best_weight = candidate->weight;
+                best = j;
+            }
+        }
+
+        // no suitable node is left: stop, the rest is implemented in soft logic
+        if (best < 0)
+            break;
+
+        // weight -1 tells later rounds (and the clean-up below) that the node was hardened
+        nnode_t *chosen = weighted_nodes[best];
+        chosen->weight = -1;
+        if (hard_multipliers) {
+            instantiate_hard_multiplier(chosen, this->cached_traverse_value, netlist);
+        }
+    }
+
+    // Walking backwards, drop every node whose weight is -1 from the vector; what
+    // remains is left for the caller to instantiate in soft logic.
+    for (int i = candidates - 1; i >= 0; i--) {
+        if (weighted_nodes[i]->weight == -1) {
+            weighted_nodes.erase(weighted_nodes.begin() + i);
+        }
+    }
+}
+
+void MixingOpt::set_blocks_needed(int new_count) { this->_blocks_count = new_count; } // base pass: stored as given, no capping or scaling
+
+void MultsOpt::set_blocks_needed(int new_count)
+{
+    // with development for fixed_layout, this value will change
+    const int available_hard_blocks = INT_MAX;
+
+    // usable blocks: what the netlist asks for, capped by what is available
+    const int usable = (available_hard_blocks > new_count) ? new_count : available_hard_blocks;
+
+    // the configured budget may only be lowered here, never raised
+    if (usable < this->_blocks_count) {
+        this->_blocks_count = usable;
+    }
+
+    // finally apply the hard/soft ratio; scale_counts() also validates the
+    // resulting configuration
+    this->scale_counts();
+}
+void MixingOpt::instantiate_soft_logic(netlist_t * /*netlist*/, std::vector<nnode_t *> /* nodes*/)
+{
+    error_message(NETLIST, unknown_location, "Performing instantiate_soft_logic was called for optimization without method provided, for kind  %i\n",
+                  this->_kind);
+    exit(EXIT_FAILURE); // the base pass has no soft implementation: this is an error, so the status is non-zero
+}
+
+void MixingOpt::partial_map_node(nnode_t * /*node*/, short /*traverse_value*/, netlist_t * /*netlist*/, HardSoftLogicMixer * /*mixer*/)
+{
+    error_message(NETLIST, unknown_location, "Performing partial_map_node was called for optimization without method provided, for kind  %i\n",
+                  this->_kind);
+    exit(EXIT_FAILURE); // the base pass cannot map any node: this is an error, so the status is non-zero
+}
+
+void MultsOpt::partial_map_node(nnode_t *node, short traverse_value, netlist_t *netlist, HardSoftLogicMixer *mixer)
+{
+    const bool can_harden = mixer->hardenable(node);
+    if (can_harden && mixer->enabled(node))
+        mixer->note_candidate_node(node); // only recorded here; nothing is instantiated yet
+    else if (can_harden)
+        instantiate_hard_multiplier(node, traverse_value, netlist);
+    else if (!hard_adders)
+        instantiate_simple_soft_multiplier(node, traverse_value, netlist);
+    this->cached_traverse_value = traverse_value; // read back by perform() and instantiate_soft_logic()
+}
+
+void MultsOpt::instantiate_soft_logic(netlist_t *netlist, std::vector<nnode_t *> nodes)
+{
+    // first expand every remaining candidate into a simple soft multiplier ...
+    for (nnode_t *soft_node : nodes) {
+        instantiate_simple_soft_multiplier(soft_node, this->cached_traverse_value, netlist);
+    }
+    // ... then hand the original nodes to free_nnode, back to front, emptying the local copy
+    while (!nodes.empty()) {
+        free_nnode(nodes.back());
+        nodes.pop_back();
+    }
+}
diff --git a/parmys-plugin/mapping/mixing_optimization.h b/parmys-plugin/mapping/mixing_optimization.h
new file mode 100644
index 000000000..3aa71fb7b
--- /dev/null
+++ b/parmys-plugin/mapping/mixing_optimization.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _MIXING_OPTIMIZATION_H_
+#define _MIXING_OPTIMIZATION_H_
+
+#include "odin_types.h"
+
+class HardSoftLogicMixer;
+/**
+ * @brief A base class in hierarchy for complex synthesis
+ * allowing for mixing soft and hard logic
+ */
+class MixingOpt
+{
+  public:
+    /**
+     * @brief Construct a new Mixing Opt object for disabled optimization
+     * usable for querying 'hardenable' condition
+     */
+    MixingOpt() { _enabled = false; }
+
+    /**
+     * @brief Construct a new Mixing Opt object
+     * (_enabled is not set here and keeps its default of false)
+     * By default, all optimizations only share
+     * the ratio of blocks to be implemented in
+     * hard logic
+     * @param ratio a value within 0 to 1 to
+     * implement ratio*requested hard blocks in
+     * hard logic
+     * @param kind a kind of blocks that correspond
+     * to optimization pass
+     */
+    MixingOpt(float ratio, operation_list kind) : _kind(kind) { _ratio = ratio; }
+
+    /**
+     * @brief Destroy the Mixing Opt object
+     * (virtual, so a pass can be deleted through a MixingOpt pointer)
+     */
+    virtual ~MixingOpt() = default;
+
+    /**
+     * @brief assign weights to the candidate nodes vector, according to netlist_statistic
+     * @param netlist pointer to the netlist
+     * @param nodes vector of candidate nodes; this base version only reports an error
+     */
+    virtual void assign_weights(netlist_t *netlist, std::vector<nnode_t *> nodes);
+
+    /**
+     * @brief Checks if the optimization is enabled
+     *
+     * @return the value of the _enabled flag
+     */
+    virtual bool enabled() { return _enabled; }
+
+    /**
+     * @brief Instantiates an alternative (not on hard blocks)
+     * implementation for the operation
+     * (this base version only reports an error)
+     * @param netlist
+     * @param nodes
+     */
+    virtual void instantiate_soft_logic(netlist_t *netlist, std::vector<nnode_t *> nodes);
+
+    /**
+     * @brief performs the optimization pass, varies between kinds.
+     * If the implementation is not provided within the inherited class
+     * this base version reports an ODIN error
+     *
+     * @param netlist_t* pointer to a global netlist
+     * @param std::vector<nnode_t*> a vector with nodes the optimization
+     * pass is concerned (all of which are potential candidates to
+     * be implemented in hard blocks for a given _kind)
+     */
+    virtual void perform(netlist_t *, std::vector<nnode_t *> &);
+
+    /**
+     * @brief Set the number of blocks required
+     * by counting in netlist
+     *
+     * @param count
+     */
+    virtual void set_blocks_needed(int count);
+
+    operation_list get_kind() { return _kind; }
+
+    /**
+     * @brief based on criteria for hardening given kind of operation, return
+     * if the node should be implemented in hard blocks
+     * (this base version always returns false)
+     * @param nnode_t* pointer to the node
+     */
+    virtual bool hardenable(nnode_t *) { return false; }
+
+    /**
+     * @brief allowing for replacing with dynamic polymorphism for different
+     * kinds of nodes
+     * (this base version only reports an error)
+     * @param nnode_t* node, short traverse value, netlist_t* netlist, HardSoftLogicMixer* calling mixer
+     */
+    virtual void partial_map_node(nnode_t *, short, netlist_t *, HardSoftLogicMixer *);
+
+  protected:
+    /**
+     * @brief a routine that will multiply
+     * required blocks by the ratio
+     */
+    virtual void scale_counts();
+
+    /**
+     * @brief this variable allows to cache traverse value
+     * (starts at 0; MultsOpt::partial_map_node stores the latest value here)
+     */
+    short cached_traverse_value = 0;
+
+    // an integer representing the number of required hard blocks
+    // that should be estimated and updated through set blocks needed
+    int _blocks_count = -1;
+    // a boolean type to double check if the optimization is enabled
+    bool _enabled = false;
+    // a parameter allowing for scaling counts
+    float _ratio = -1.0;
+    // an enum kind variable, corresponding to an optimization pass
+    operation_list _kind = operation_list_END;
+};
+
+class MultsOpt : public MixingOpt
+{
+  public:
+    /**
+     * @brief Construct a new Mults Opt object for disabled optimization
+     * usable for querying 'hardenable' condition
+     */
+    MultsOpt() : MixingOpt() {}
+
+    /**
+     * @brief Construct a new Mults Opt object
+     * from ratio parameter
+     * @param ratio share (within 0 to 1) of the needed hard multipliers to harden
+     */
+    MultsOpt(float ratio);
+    /**
+     * @brief Construct a new Mults Opt object
+     * allowing to set exact number of multipliers
+     * that will be used
+     * @param exact
+     */
+    MultsOpt(int exact);
+
+    /**
+     * @brief assign weights to the candidate nodes vector, according to netlist_statistic
+     * @param netlist pointer to the netlist
+     * @param nodes vector with the candidate multipliers
+     */
+    void assign_weights(netlist_t *netlist, std::vector<nnode_t *> nodes) override;
+
+    /**
+     * @brief notes the multiplier as a candidate when mixing is enabled and the
+     * node is hardenable, otherwise instantiates it directly
+     * (hard multiplier if hardenable, simple soft multiplier if there are no hard adders)
+     * @param nnode_t* node, short traverse value, netlist_t* netlist, HardSoftLogicMixer* calling mixer
+     */
+    void partial_map_node(nnode_t *, short, netlist_t *, HardSoftLogicMixer *) override;
+    /**
+     * @brief Instantiates an alternative (not on hard blocks)
+     * implementation for the operation
+     * (simple soft multipliers; the given nodes are then passed to free_nnode)
+     * @param netlist
+     * @param nodes
+     */
+    void instantiate_soft_logic(netlist_t *netlist, std::vector<nnode_t *> nodes) override;
+
+    /**
+     * @brief performs the optimization pass, specifically for multipliers.
+     * Hardens the heaviest hardenable candidates first, up to the block
+     * count, and erases the hardened nodes from the vector
+     *
+     * @param netlist_t* pointer to a global netlist
+     * @param std::vector<nnode_t*> a vector with nodes the optimization
+     * pass is concerned (all of which are potential candidates to
+     * be implemented in hard blocks for a given _kind)
+     */
+    void perform(netlist_t *netlist, std::vector<nnode_t *> &) override;
+
+    /**
+     * @brief Set the number of blocks required
+     * by counting in netlist. Overridden to cap the current count at the
+     * given value and then scale it by the ratio
+     *
+     * @param count
+     */
+    void set_blocks_needed(int) override;
+
+    /**
+     * @brief based on criteria for hardening given kind of operation, return
+     * if the node should be implemented in hard blocks
+     * (hard multipliers exist and the wider of the first two inputs exceeds min_mult)
+     * @param nnode_t* pointer to the node
+     */
+    bool hardenable(nnode_t *) override;
+};
+
+#endif // _MIXING_OPTIMIZATION_H_
diff --git a/parmys-plugin/mapping/odin_ii.cc b/parmys-plugin/mapping/odin_ii.cc
new file mode 100644
index 000000000..e53f19b3a
--- /dev/null
+++ b/parmys-plugin/mapping/odin_ii.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <sstream>
+
+#include "odin_ii.h"
+
+#include "odin_globals.h"
+#include "odin_types.h"
+
+#include "hard_soft_logic_mixer.h"
+#include "vtr_path.h"
+
+#define DEFAULT_OUTPUT "."
+
+loc_t my_location;
+
+t_arch Arch;
+global_args_t global_args;
+short physical_lut_size = -1; // -1 is only the initial value; nothing in this file assigns it
+HardSoftLogicMixer *mixer;    // only defined here; this file never allocates it
+
+/* CONSTANT NET ELEMENTS (only defined here; no value is assigned in this file) */
+char *one_string;
+char *zero_string;
+char *pad_string;
+
+/*---------------------------------------------------------------------------
+ * (function: set_default_config)
+ *-------------------------------------------------------------------------*/
+void set_default_config()
+{
+    /* Load the defaults into the global configuration. */
+
+    /* files and paths */
+    configuration.arch_file = "";
+    configuration.tcl_file = "";
+    configuration.dsp_verilog = "arch_dsp.v";
+    // TODO: unused
+    configuration.debug_output_path = std::string(DEFAULT_OUTPUT);
+    configuration.output_netlist_graphs = 0;
+
+    /* boolean switches */
+    configuration.coarsen = false;
+    configuration.adder_cin_global = false;
+
+    /* hard block sizing: multipliers first, then memories */
+    configuration.split_hard_multiplier = 0;
+    configuration.fixed_hard_multiplier = 0;
+    configuration.split_memory_depth = 0;
+    configuration.split_memory_width = 0;
+
+    /* soft logic cutoffs: a memory (or a split part of one) whose width AND depth are below these becomes soft logic */
+    configuration.soft_logic_memory_depth_threshold = 0;
+    configuration.soft_logic_memory_width_threshold = 0;
+}
diff --git a/parmys-plugin/mapping/odin_ii.h b/parmys-plugin/mapping/odin_ii.h
new file mode 100644
index 000000000..dec1624dc
--- /dev/null
+++ b/parmys-plugin/mapping/odin_ii.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _ODIN_II_H_
+#define _ODIN_II_H_
+
+#include "odin_types.h"
+
+/* Odin-II exit status enumerator; no explicit values are given, so numbering starts at 0 with ERROR_INITIALIZATION */
+enum ODIN_ERROR_CODE { ERROR_INITIALIZATION, ERROR_PARSE_CONFIG, ERROR_PARSE_ARCH, ERROR_ELABORATION, ERROR_OPTIMIZATION, ERROR_TECHMAP };
+
+void set_default_config();
+
+#endif // _ODIN_II_H_
diff --git a/parmys-plugin/mapping/partial_map.cc b/parmys-plugin/mapping/partial_map.cc
new file mode 100644
index 000000000..b1f676a7c
--- /dev/null
+++ b/parmys-plugin/mapping/partial_map.cc
@@ -0,0 +1,1412 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_globals.h"
+#include "odin_types.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "odin_util.h"
+
+#include "adder.h"
+#include "hard_block.h"
+
+#include "memory.h"
+
+#include "partial_map.h"
+#include "subtractor.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+USING_YOSYS_NAMESPACE
+
+void depth_first_traverse_partial_map(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+void partial_map_node(nnode_t *node, short traverse_number, netlist_t *netlist);
+
+void instantiate_not_logic(nnode_t *node, short mark, netlist_t *netlist);
+bool eliminate_buffer(nnode_t *node, short, netlist_t *);
+void instantiate_bitwise_logic(nnode_t *node, operation_list op, short mark, netlist_t *netlist);
+void instantiate_bitwise_reduction(nnode_t *node, operation_list op, short mark);
+void instantiate_logical_logic(nnode_t *node, operation_list op, short mark);
+void instantiate_EQUAL(nnode_t *node, operation_list type, short mark, netlist_t *netlist);
+void instantiate_GE(nnode_t *node, operation_list type, short mark, netlist_t *netlist);
+void instantiate_GT(nnode_t *node, operation_list type, short mark, netlist_t *netlist);
+void instantiate_shift(nnode_t *node, short mark, netlist_t *netlist);
+void instantiate_unary_sub(nnode_t *node, short mark, netlist_t *netlist);
+void instantiate_sub_w_carry(nnode_t *node, short mark, netlist_t *netlist);
+void instantiate_sub_w_borrow(nnode_t *node, short mark, netlist_t *netlist);
+
+void instantiate_soft_logic_ram(nnode_t *node, short mark, netlist_t *netlist);
+
+static void instantiate_constant_shift(nnode_t *node, operation_list type, short mark, netlist_t *netlist);
+static void instantiate_variable_shift(nnode_t *node, operation_list type, short mark, netlist_t *netlist);
+
+/*-------------------------------------------------------------------------
+ * (function: partial_map_top) starts the partial-map traversal, marking nodes with PARTIAL_MAP_TRAVERSE_VALUE
+ *-----------------------------------------------------------------------*/
+void partial_map_top(netlist_t *netlist) { depth_first_traversal_to_partial_map(PARTIAL_MAP_TRAVERSE_VALUE, netlist); }
+
+/*---------------------------------------------------------------------------------------------
+ * (function: depth_first_traversal_to_partial_map)
+ *-------------------------------------------------------------------------------------------*/
+void depth_first_traversal_to_partial_map(short marker_value, netlist_t *netlist)
+{
+    // roots of the traversal: every non-NULL top-level input ...
+    for (int i = 0; i < netlist->num_top_input_nodes; i++) {
+        if (nnode_t *top_input = netlist->top_input_nodes[i])
+            depth_first_traverse_partial_map(top_input, marker_value, netlist);
+    }
+    // ... followed by the netlist's gnd, vcc and pad nodes, in that order
+    depth_first_traverse_partial_map(netlist->gnd_node, marker_value, netlist);
+    depth_first_traverse_partial_map(netlist->vcc_node, marker_value, netlist);
+    depth_first_traverse_partial_map(netlist->pad_node, marker_value, netlist);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: depth_first_traverse_partial_map)
+ *-------------------------------------------------------------------------------------------*/
+void depth_first_traverse_partial_map(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    // a node already carrying this mark was reached earlier in the traversal
+    if (node->traverse_visited == traverse_mark_number)
+        return;
+
+    node->traverse_visited = traverse_mark_number;
+
+    // descend into every node fed by one of this node's output nets
+    for (int i = 0; i < node->num_output_pins; i++) {
+        nnet_t *next_net = node->output_pins[i]->net;
+        if (!next_net || !next_net->fanout_pins)
+            continue;
+
+        for (int j = 0; j < next_net->num_fanout_pins; j++) {
+            auto *fanout = next_net->fanout_pins[j];
+            if (fanout && fanout->node)
+                depth_first_traverse_partial_map(fanout->node, traverse_mark_number, netlist);
+        }
+    }
+
+    // post-order: the node itself is mapped only after everything downstream of it
+    partial_map_node(node, traverse_mark_number, netlist);
+}
+
+/*----------------------------------------------------------------------
+ * (function: partial_map_node) maps one node according to its operation type
+ *--------------------------------------------------------------------*/
+void partial_map_node(nnode_t *node, short traverse_number, netlist_t *netlist)
+{
+    switch (node->type) {
+    case BITWISE_NOT:
+        instantiate_not_logic(node, traverse_number, netlist);
+        break;
+    case BUF_NODE:
+        eliminate_buffer(node, traverse_number, netlist);
+        break;
+    case BITWISE_AND:
+    case BITWISE_OR:
+    case BITWISE_NAND:
+    case BITWISE_NOR:
+    case BITWISE_XNOR:
+    case BITWISE_XOR:
+        if (node->num_input_port_sizes >= 2) {
+            instantiate_bitwise_logic(node, node->type, traverse_number, netlist);
+        } else if (node->num_input_port_sizes == 1) {
+            instantiate_bitwise_reduction(node, node->type, traverse_number);
+        } else
+            oassert(false);
+        break;
+
+    case LOGICAL_OR:
+    case LOGICAL_AND:
+    case LOGICAL_NOR:
+    case LOGICAL_NAND:
+    case LOGICAL_XOR:
+    case LOGICAL_XNOR:
+        if (node->num_input_port_sizes == 2) {
+            instantiate_logical_logic(node, node->type, traverse_number);
+        }
+        break;
+
+    case LOGICAL_NOT:
+        /* don't need to do anything since this is a reduction */
+        break;
+
+    case ADD:
+        if (hard_adders && node->bit_width >= min_threshold_adder) {
+            // Check if the size of this adder is greater than the hard vs soft logic threshold
+            instantiate_hard_adder(node, traverse_number, netlist);
+        } else {
+            instantiate_add_w_carry(node, traverse_number, netlist);
+        }
+        break;
+    case MINUS:
+        if (hard_adders) {
+            if (node->num_input_port_sizes == 3) {
+                int max_num = (node->input_port_sizes[0] >= node->input_port_sizes[1]) ? node->input_port_sizes[0] : node->input_port_sizes[1];
+                if (max_num >= min_add)
+                    instantiate_hard_adder_subtraction(node, traverse_number, netlist);
+                else
+                    instantiate_add_w_carry(node, traverse_number, netlist);
+            } else if (node->num_input_port_sizes == 2) {
+                instantiate_sub_w_carry(node, traverse_number, netlist);
+            } else if (node->num_input_port_sizes == 1) {
+                instantiate_unary_sub(node, traverse_number, netlist);
+            } else
+                oassert(false);
+        } else {
+            if (node->num_input_port_sizes == 3) {
+                instantiate_sub_w_borrow(node, traverse_number, netlist);
+            } else if (node->num_input_port_sizes == 2) {
+                instantiate_sub_w_carry(node, traverse_number, netlist);
+            } else if (node->num_input_port_sizes == 1) {
+                instantiate_unary_sub(node, traverse_number, netlist);
+            } else
+                oassert(false);
+        }
+
+        break;
+    case LOGICAL_EQUAL:
+    case NOT_EQUAL:
+        instantiate_EQUAL(node, node->type, traverse_number, netlist);
+        break;
+    case GTE:
+    case LTE:
+        instantiate_GE(node, node->type, traverse_number, netlist);
+        break;
+    case GT:
+    case LT:
+        instantiate_GT(node, node->type, traverse_number, netlist);
+        break;
+    case SL:
+    case ASL:
+    case SR:
+    case ASR:
+        instantiate_shift(node, traverse_number, netlist);
+        break;
+    case MULTI_PORT_MUX:
+        instantiate_multi_port_mux(node, traverse_number, netlist);
+        break;
+    case MULTIPORT_nBIT_SMUX:
+        instantiate_multi_port_n_bits_mux(node, traverse_number, netlist);
+        break;
+    case MULTIPLY: {
+        mixer->partial_map_node(node, traverse_number, netlist);
+        break;
+    }
+    case MEMORY: {
+        ast_node_t *ast_node = node->related_ast_node;
+        char *identifier = ast_node->identifier_node->types.identifier;
+        if (find_hard_block(identifier)) {
+            long depth = is_sp_ram(node) ? get_sp_ram_depth(node) : get_dp_ram_depth(node);
+            long width = is_sp_ram(node) ? get_sp_ram_width(node) : get_dp_ram_width(node);
+
+            // If the memory satisfies the threshold for the use of a hard logic block, use one.
+            if (depth > configuration.soft_logic_memory_depth_threshold || width > configuration.soft_logic_memory_width_threshold) {
+                instantiate_hard_block(node, traverse_number, netlist);
+            } else {
+                log("\tInferring soft logic ram: %ldx%ld\n", width, depth); // width and depth are long, hence %ld
+                instantiate_soft_logic_ram(node, traverse_number, netlist);
+            }
+        } else {
+            instantiate_soft_logic_ram(node, traverse_number, netlist);
+        }
+        break;
+    }
+    case HARD_IP:
+        instantiate_hard_block(node, traverse_number, netlist);
+
+        break;
+    case ADDER_FUNC:
+    case CARRY_FUNC:
+    case MUX_2:
+    case SMUX_2:
+    case FF_NODE:
+    case INPUT_NODE:
+    case CLOCK_NODE:
+    case OUTPUT_NODE:
+    case GND_NODE:
+    case VCC_NODE:
+    case PAD_NODE:
+        /* some nodes already in the form that is mapable */
+        break;
+    case SKIP: // should skip
+        instantiate_hard_block(node, traverse_number, netlist);
+        break;
+    case CASE_EQUAL:
+    case CASE_NOT_EQUAL:
+    case DIVIDE:
+    case MODULO:
+    // case MULTIPORT_nBIT_SMUX:
+    default:
+        error_message(NETLIST, node->loc, "%s", "Partial map: node should have been converted to softer version.");
+        break;
+    }
+}
+
+/**
+ * (function: instantiate_soft_logic_ram)
+ *
+ * @brief hand a memory node to the soft logic ram builder that matches
+ * its kind: single port first, dual port otherwise
+ *
+ * @param node pointing to the memory node
+ * @param mark unique traversal mark handed on to the builder
+ * @param netlist pointer to the current netlist file
+ */
+void instantiate_soft_logic_ram(nnode_t *node, short mark, netlist_t *netlist)
+{
+    if (is_sp_ram(node)) {
+        instantiate_soft_single_port_ram(node, mark, netlist);
+        return;
+    }
+
+    if (is_dp_ram(node)) {
+        instantiate_soft_dual_port_ram(node, mark, netlist);
+        return;
+    }
+
+    /* the node is neither a single port nor a dual port ram */
+    oassert(false);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_multi_port_mux )
+ * 	Replaces a multiport mux by one MUX_2 gate per input port that follows port 0.
+ * 	Port 0 (the one hot control) is moved onto the first gate and copied onto every
+ * 	other gate; gate j then receives pins of input port j+1 and output pin j.
+ * 	The original node is freed at the end.
+ *
+ * 	NOTE(review): data pins are located with the size of port 1 as the stride, so
+ * 	all data ports are presumably expected to have that size - confirm with callers.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_multi_port_mux(nnode_t *node, short mark, netlist_t * /*netlist*/)
+{
+    /* sizes used for indexing into the flat input pin array */
+    const int control_width = node->input_port_sizes[0];
+    const int mux_count = node->num_input_port_sizes - 1;
+    const int data_port_stride = node->input_port_sizes[1];
+
+    /* create all the 2-muxes before any pin is moved */
+    nnode_t **two_muxes = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * mux_count);
+    for (int m = 0; m < mux_count; ++m)
+        two_muxes[m] = make_2port_gate(MUX_2, control_width, control_width, 1, node, mark);
+
+    for (int m = 0; m < mux_count; ++m) {
+        nnode_t *mux = two_muxes[m];
+        const int data_base = (m + 1) * data_port_stride;
+
+        for (int bit = 0; bit < control_width; ++bit) {
+            /* data pins go after the control pins of the new mux */
+            remap_pin_to_new_node(node->input_pins[data_base + bit], mux, control_width + bit);
+
+            /* the first mux takes over the control pins, the others get a copy of them */
+            if (m > 0)
+                add_input_pin_to_node(mux, copy_input_npin(two_muxes[0]->input_pins[bit]), bit);
+            else
+                remap_pin_to_new_node(node->input_pins[bit], mux, bit);
+        }
+
+        /* each mux drives one output pin of the original node */
+        remap_pin_to_new_node(node->output_pins[m], mux, 0);
+    }
+
+    vtr::free(two_muxes);
+    free_nnode(node);
+}
+
+/**
+ * (function: transform_to_single_bit_mux_nodes)
+ *
+ * @brief split the mux node read from yosys blif into one node of the
+ * same type per output pin; every new node has the whole selector
+ * port, one pin of each data port and one output pin
+ *
+ * The original node is freed before returning. The returned array holds
+ * one entry per output pin of the original node and must be released by
+ * the caller.
+ *
+ * @param node pointing to the mux node
+ * @param traverse_mark_number unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file (unused)
+ *
+ * @return array of the single bit mux nodes
+ */
+nnode_t **transform_to_single_bit_mux_nodes(nnode_t *node, uintptr_t traverse_mark_number, netlist_t * /* netlist */)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    const int port_count = node->num_input_port_sizes;
+    const int sel_width = node->input_port_sizes[0];
+    const int bit_count = node->num_output_pins;
+
+    /* port 0 is the selector; every other port must be as wide as port 1 */
+    for (int port = 2; port < port_count; ++port)
+        oassert(node->input_port_sizes[port] == node->input_port_sizes[1]);
+
+    nnode_t **bit_muxes = (nnode_t **)vtr::calloc(bit_count, sizeof(nnode_t *));
+
+    for (int bit = 0; bit < bit_count; ++bit) {
+        /* only the last new node takes over the selector pins, the others copy them */
+        const bool takes_original_selector = (bit == bit_count - 1);
+
+        nnode_t *bit_mux = allocate_nnode(node->loc);
+        bit_muxes[bit] = bit_mux;
+
+        bit_mux->type = node->type;
+        bit_mux->traverse_visited = traverse_mark_number;
+        /* the name is derived from the name of the original node */
+        bit_mux->name = node_name(bit_mux, node->name);
+
+        /* selector port */
+        add_input_port_information(bit_mux, sel_width);
+        allocate_more_input_pins(bit_mux, sel_width);
+        for (int s = 0; s < sel_width; ++s) {
+            if (takes_original_selector)
+                remap_pin_to_new_node(node->input_pins[s], bit_mux, s);
+            else
+                add_input_pin_to_node(bit_mux, copy_input_npin(node->input_pins[s]), s);
+        }
+
+        /* data ports: move pin [bit] of every data port onto the new node */
+        int port_base = sel_width;
+        for (int port = 1; port < port_count; ++port) {
+            add_input_port_information(bit_mux, 1);
+            allocate_more_input_pins(bit_mux, 1);
+
+            remap_pin_to_new_node(node->input_pins[port_base + bit], bit_mux, sel_width + port - 1);
+            port_base += node->input_port_sizes[port];
+        }
+
+        /* output port: move output pin [bit] onto the new node */
+        add_output_port_information(bit_mux, 1);
+        allocate_more_output_pins(bit_mux, 1);
+        remap_pin_to_new_node(node->output_pins[bit], bit_mux, 0);
+    }
+
+    /* all pins have been moved or copied, the original node is not needed anymore */
+    free_nnode(node);
+
+    return bit_muxes;
+}
+
+/**
+ * (function: instantiate_multi_port_n_bits_mux)
+ *
+ * @brief Makes the multiport n bits multiplexer into
+ * a series of 2-Mux-decoded
+ *
+ * The node is first split into one single bit mux per output pin by
+ * transform_to_single_bit_mux_nodes, which also frees the node. A single
+ * bit mux with a one bit selector and two data ports is only retyped to
+ * SMUX_2 and renamed. Any other single bit mux is rebuilt as
+ * selector_width stages of SMUX_2 gates and then freed.
+ *
+ * [NOTE]: Selector should be the first port
+ *
+ * @param node pointing to the mux node
+ * @param mark unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ */
+void instantiate_multi_port_n_bits_mux(nnode_t *node, short mark, netlist_t *netlist)
+{
+    char *name = vtr::strdup(node->name);
+    int num_single_muxes = node->num_output_pins;
+    /* This splits the multiport n bit mux node into multiport 1 bit muxes.
+     * The name copy and the output pin count are taken above because the
+     * split frees the original node. */
+    nnode_t **single_bit_muxes = transform_to_single_bit_mux_nodes(node, mark, netlist);
+
+    /* iterating over single bit muxes that has multiple (>2) port to turn them into 2-mux */
+    for (int cnt = 0; cnt < num_single_muxes; cnt++) {
+        nnode_t *single_bit_mux = single_bit_muxes[cnt];
+
+        /* keeping the information of each single bit mux:
+         * num_expressions is the number of data ports (all ports but the selector),
+         * port_offset is the index of the first data pin (same value as selector_width) */
+        int num_expressions = single_bit_mux->num_input_port_sizes - 1;
+        int port_offset = single_bit_mux->input_port_sizes[0];
+        int selector_width = single_bit_mux->input_port_sizes[0];
+
+        /* need to reorder to turn to a smux, nothing more:
+         * the node is kept, only its type and name change */
+        if (selector_width == 1 && num_expressions == 2) {
+            single_bit_mux->type = SMUX_2;
+            if (single_bit_mux->name)
+                vtr::free(single_bit_mux->name);
+
+            single_bit_mux->name = node_name(single_bit_mux, name);
+        } else {
+            nnode_t ***muxes = (nnode_t ***)vtr::calloc(selector_width, sizeof(nnode_t **));
+            /* to keep the internal output signals for future usage */
+            signal_list_t **output_signals = (signal_list_t **)vtr::calloc(selector_width, sizeof(signal_list_t *));
+            /* creating multiple stages to decode single bit mux into 2-mux */
+            for (int i = 0; i < selector_width; i++) {
+                /* num of muxes in each stage: 0x1 shifted left by selector_width - (i + 1) */
+                int num_of_muxes = shift_left_value_with_overflow_check(0x1, selector_width - (i + 1), single_bit_mux->loc);
+                muxes[i] = (nnode_t **)vtr::calloc(num_of_muxes, sizeof(nnode_t *));
+                output_signals[i] = init_signal_list();
+
+                // single_bit_mux->input_pins[i] === selector[i]; stage i is driven by pin selector_width - i - 1
+                npin_t *selector_pin = single_bit_mux->input_pins[selector_width - i - 1];
+
+                /* iterating over each single bit 2-mux to connect inputs */
+                for (int j = 0; j < num_of_muxes; j++) {
+
+                    muxes[i][j] = make_2port_gate(SMUX_2, 1, 2, 1, single_bit_mux, mark);
+
+                    /* the last mux of the stage takes over the selector pin, the others get a copy */
+                    if (j != num_of_muxes - 1)
+                        add_input_pin_to_node(muxes[i][j], copy_input_npin(selector_pin), 0);
+                    else
+                        remap_pin_to_new_node(selector_pin, muxes[i][j], 0);
+
+                    /* connecting the single bit mux input pins into decoded 2-Muxes:
+                     * mux j pairs data pin j with data pin j + num_expressions / 2.
+                     * NOTE(review): presumably num_expressions is 2^selector_width here - confirm */
+                    if (i == 0) {
+                        remap_pin_to_new_node(single_bit_mux->input_pins[port_offset + j], muxes[i][j], 1);
+
+                        remap_pin_to_new_node(single_bit_mux->input_pins[port_offset + j + (num_expressions / 2)], muxes[i][j], 2);
+                    }
+                    /* connecting the outputs of internal 2-muxes to next level 2-muxes as input:
+                     * mux j pairs outputs j and j + num_of_muxes of the previous stage */
+                    else {
+                        add_input_pin_to_node(muxes[i][j], output_signals[i - 1]->pins[j], 1);
+
+                        add_input_pin_to_node(muxes[i][j], output_signals[i - 1]->pins[j + num_of_muxes], 2);
+                    }
+
+                    // Connect output pin to related input pin (every stage but the last drives a new internal net)
+                    if (i != selector_width - 1) {
+                        npin_t *new_pin1 = allocate_npin();
+                        npin_t *new_pin2 = allocate_npin();
+                        nnet_t *new_net = allocate_nnet();
+                        new_net->name = make_full_ref_name(NULL, NULL, NULL, muxes[i][j]->name, j);
+                        /* hook the output pin into the node */
+                        add_output_pin_to_node(muxes[i][j], new_pin1, 0);
+                        /* hook up new pin 1 into the new net */
+                        add_driver_pin_to_net(new_net, new_pin1);
+                        /* hook up the new pin 2 to this new net */
+                        add_fanout_pin_to_net(new_net, new_pin2);
+
+                        // Storing the output pins of the current mux stage as the input of the next one
+                        add_pin_to_signal_list(output_signals[i], new_pin2);
+
+                    } else {
+                        remap_pin_to_new_node(single_bit_mux->output_pins[j], muxes[i][j], 0);
+                    }
+                }
+            }
+
+            // CLEAN UP per single mux (only the bookkeeping arrays are freed, not the new 2-mux nodes)
+            for (int i = 0; i < selector_width; i++) {
+                vtr::free(muxes[i]);
+            }
+            vtr::free(muxes);
+
+            for (int i = 0; i < selector_width; i++) {
+                free_signal_list(output_signals[i]);
+            }
+            vtr::free(output_signals);
+
+            // to free each single mux node
+            free_nnode(single_bit_mux);
+        }
+    }
+
+    // CLEAN UP
+    vtr::free(single_bit_muxes);
+    vtr::free(name);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_not_logic )
+ * 	Replaces the node by one NOT gate per input pin: input pin i and output pin i
+ * 	of the node are moved onto gate i, then the node is freed.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_not_logic(nnode_t *node, short mark, netlist_t * /*netlist*/)
+{
+    const int bit_count = node->num_input_pins;
+    nnode_t **inverters = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * bit_count);
+
+    /* create every gate before any pin is moved */
+    for (int bit = 0; bit < bit_count; ++bit)
+        inverters[bit] = make_not_gate(node, mark);
+
+    /* move the input and the output of each bit onto its own gate */
+    for (int bit = 0; bit < bit_count; ++bit) {
+        nnode_t *inverter = inverters[bit];
+        remap_pin_to_new_node(node->input_pins[bit], inverter, 0);
+        remap_pin_to_new_node(node->output_pins[bit], inverter, 0);
+    }
+
+    vtr::free(inverters);
+    free_nnode(node);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: eliminate_buffer )
+ * 	Buffers just pass through signals, so each output net is joined into the net of
+ * 	the matching input pin. A pin whose output net has more than one driver is left
+ * 	untouched. The node is freed only when every pin could be bypassed.
+ * 	Returns true if the buffer could be eliminated
+ *
+ * 	NOTE(review): pins bypassed before a multi driver pin is met are not restored,
+ * 	although the node is kept in that case - confirm this is intended.
+ *-------------------------------------------------------------------------------------------*/
+bool eliminate_buffer(nnode_t *node, short, netlist_t *)
+{
+    bool all_pins_bypassed = true;
+
+    for (int pin = 0; pin < node->num_input_pins; ++pin) {
+        /* position of this buffer input in the fanout list of its net */
+        int fanout_slot = node->input_pins[pin]->pin_net_idx;
+
+        /* keep the buffer on nets that have multiple drivers */
+        if (node->output_pins[pin]->net->num_driver_pins > 1) {
+            all_pins_bypassed = false;
+            continue;
+        }
+
+        /* the fanouts of the output net become fanouts of the input net */
+        join_nets(node->input_pins[pin]->net, node->output_pins[pin]->net);
+
+        /* the input net must not point to this buffer anymore */
+        node->input_pins[pin]->net->fanout_pins[fanout_slot] = NULL;
+        delete_npin(node->input_pins[pin]);
+    }
+
+    if (!all_pins_bypassed)
+        return false;
+
+    /* CLEAN UP: detach the output pins, then release the node */
+    for (int pin = 0; pin < node->num_output_pins; ++pin)
+        node->output_pins[pin]->node = NULL;
+    free_nnode(node);
+
+    return true;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_logical_logic )
+ * 	Lowers a two operand logical node: each operand is OR-reduced to a single bit
+ * 	and the two results feed a two input gate of type op, which takes over the
+ * 	output pin of the node. The node is freed at the end.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_logical_logic(nnode_t *node, operation_list op, short mark)
+{
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->num_output_pins == 1);
+
+    /* the pins of the second operand start right after the first operand */
+    const int first_width = node->input_port_sizes[0];
+    const int second_width = node->input_port_sizes[1];
+
+    /* the final gate, then one reduction per operand */
+    nnode_t *combiner = make_1port_logic_gate(op, 2, node, mark);
+    nnode_t *first_or = make_1port_logic_gate(BITWISE_OR, first_width, node, mark);
+    nnode_t *second_or = make_1port_logic_gate(BITWISE_OR, second_width, node, mark);
+
+    /* move every operand pin onto the reduction of its operand */
+    for (int bit = 0; bit < first_width; ++bit)
+        remap_pin_to_new_node(node->input_pins[bit], first_or, bit);
+    for (int bit = 0; bit < second_width; ++bit)
+        remap_pin_to_new_node(node->input_pins[first_width + bit], second_or, bit);
+
+    /* the reductions drive the two inputs of the final gate */
+    connect_nodes(first_or, 0, combiner, 0);
+    connect_nodes(second_or, 0, combiner, 1);
+
+    /* lower the reductions themselves */
+    instantiate_bitwise_reduction(first_or, BITWISE_OR, mark);
+    instantiate_bitwise_reduction(second_or, BITWISE_OR, mark);
+
+    remap_pin_to_new_node(node->output_pins[0], combiner, 0);
+    free_nnode(node);
+}
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_bitwise_reduction )
+ * 	Replaces a single port reduction node by one logic gate of the matching LOGICAL_*
+ * 	type that takes over all input pins and the output pin. The node is freed.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_bitwise_reduction(nnode_t *node, operation_list op, short mark)
+{
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 1);
+    oassert(node->output_port_sizes[0] == 1);
+
+    const int operand_width = node->input_port_sizes[0];
+
+    /* both the BITWISE_* and the LOGICAL_* spelling map to the LOGICAL_* gate */
+    operation_list gate_op;
+    if (op == BITWISE_AND || op == LOGICAL_AND) {
+        gate_op = LOGICAL_AND;
+    } else if (op == BITWISE_OR || op == LOGICAL_OR) {
+        gate_op = LOGICAL_OR;
+    } else if (op == BITWISE_NAND || op == LOGICAL_NAND) {
+        gate_op = LOGICAL_NAND;
+    } else if (op == BITWISE_NOR || op == LOGICAL_NOR) {
+        gate_op = LOGICAL_NOR;
+    } else if (op == BITWISE_XNOR || op == LOGICAL_XNOR) {
+        gate_op = LOGICAL_XNOR;
+    } else if (op == BITWISE_XOR || op == LOGICAL_XOR) {
+        gate_op = LOGICAL_XOR;
+    } else {
+        /* not a reduction operation */
+        gate_op = NO_OP;
+        oassert(false);
+    }
+
+    nnode_t *gate = make_1port_logic_gate(gate_op, operand_width, node, mark);
+
+    /* the gate takes over the inputs in order, then the single output */
+    for (int bit = 0; bit < operand_width; ++bit)
+        remap_pin_to_new_node(node->input_pins[bit], gate, bit);
+
+    remap_pin_to_new_node(node->output_pins[0], gate, 0);
+    free_nnode(node);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_bitwise_logic )
+ * 	Breaks a multi port bitwise node into one gate per output bit. Gate i gets bit i
+ * 	of every input port; a port that has no bit i contributes the zero pin instead.
+ * 	Does nothing for a NULL node. The node is freed at the end.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_bitwise_logic(nnode_t *node, operation_list op, short mark, netlist_t *netlist)
+{
+    if (!node)
+        return;
+
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes >= 2);
+
+    /* the single bit gates use the LOGICAL_* counterpart of the bitwise operation */
+    operation_list gate_op;
+    if (op == BITWISE_AND) {
+        gate_op = LOGICAL_AND;
+    } else if (op == BITWISE_OR) {
+        gate_op = LOGICAL_OR;
+    } else if (op == BITWISE_NAND) {
+        gate_op = LOGICAL_NAND;
+    } else if (op == BITWISE_NOR) {
+        gate_op = LOGICAL_NOR;
+    } else if (op == BITWISE_XNOR) {
+        gate_op = LOGICAL_XNOR;
+    } else if (op == BITWISE_XOR) {
+        gate_op = LOGICAL_XOR;
+    } else {
+        /* not a bitwise operation */
+        gate_op = NO_OP;
+        oassert(false);
+    }
+
+    for (int bit = 0; bit < node->output_port_sizes[0]; ++bit) {
+        nnode_t *bit_gate = make_nport_gate(gate_op, node->num_input_port_sizes, 1, 1, node, mark);
+
+        /* index of the first pin of the port currently handled */
+        int port_base = 0;
+        for (int port = 0; port < node->num_input_port_sizes; ++port) {
+            const int port_width = node->input_port_sizes[port];
+
+            if (bit >= port_width) {
+                /* this port is too narrow to have the bit: pad with zero */
+                add_input_pin_to_node(bit_gate, get_zero_pin(netlist), port);
+            } else {
+                /* move the bit of this port onto the gate */
+                remap_pin_to_new_node(node->input_pins[bit + port_base], bit_gate, port);
+            }
+
+            port_base += port_width;
+        }
+
+        remap_pin_to_new_node(node->output_pins[bit], bit_gate, 0);
+    }
+
+    free_nnode(node);
+}
+
+/*--------------------------------------------------------------------------
+ * (function: instantiate_add_w_carry )
+ * 	Soft addition for output formats that don't handle multi-output logic
+ *	functions (BLIF). Collects the output width and the widths of the first
+ *	two input ports, then hands the node to instantiate_add_w_carry_block
+ *	with 0 as last argument.
+ *------------------------------------------------------------------------*/
+void instantiate_add_w_carry(nnode_t *node, short mark, netlist_t *netlist)
+{
+    /* layout of the width array handed to the block builder */
+    enum { OUT_SLOT = 0,
+           IN_A_SLOT = 1,
+           IN_B_SLOT = 2,
+           SLOT_COUNT = 3 };
+
+    oassert(node->num_input_pins > 0);
+
+    int *widths = (int *)vtr::malloc(SLOT_COUNT * sizeof(int));
+
+    /* with exactly two input ports the first output port gives the width,
+     * otherwise the total number of output pins is used */
+    widths[OUT_SLOT] = (node->num_input_port_sizes == 2) ? node->output_port_sizes[0]
+                                                         : node->num_output_pins;
+    widths[IN_A_SLOT] = node->input_port_sizes[0];
+    widths[IN_B_SLOT] = node->input_port_sizes[1];
+
+    instantiate_add_w_carry_block(widths, node, mark, netlist, 0);
+
+    vtr::free(widths);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_sub_w_carry )
+ * 	This subtraction is intended for soft subtraction with output formats that can't handle
+ * 	multi output logic functions.  We split the add and the carry over two logic functions.
+ *
+ * 	With a single input port the node is treated as unary: operand A gets width 0 and the
+ * 	port is operand B. With two or more input ports the first two are operands A and B.
+ * 	The widths are handed to instantiate_add_w_carry_block with 1 as last argument.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_sub_w_carry(nnode_t *node, short mark, netlist_t *netlist)
+{
+    // define locations in array when fetching pins
+    const int out = 0, input_a = 1, input_b = 2, pinout_count = 3;
+
+    oassert(node->num_input_pins > 0);
+
+    int *width = (int *)vtr::malloc(pinout_count * sizeof(int));
+    width[out] = node->output_port_sizes[0];
+
+    /* start from zero so that no port count can leave an operand width uninitialised */
+    width[input_a] = 0;
+    width[input_b] = 0;
+
+    if (node->num_input_port_sizes == 1) {
+        /* unary form: the only port is operand B */
+        width[input_b] = node->input_port_sizes[0];
+    } else if (node->num_input_port_sizes >= 2) {
+        /* same convention as instantiate_add_w_carry: ports 0 and 1 are the operands */
+        width[input_a] = node->input_port_sizes[0];
+        width[input_b] = node->input_port_sizes[1];
+    } else {
+        /* a subtraction without any input port is malformed */
+        oassert(false);
+    }
+
+    instantiate_add_w_carry_block(width, node, mark, netlist, 1);
+
+    vtr::free(width);
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: instantiate_sub_w_borrow )
+ *
+ * @brief instantiating a single bit subtraction circuit with borrow_in and borrow_out
+ *
+ * The node must have at least two input ports and one output port, and
+ * the first two input ports and the first output port must be one bit
+ * wide. The circuit itself is built by instantiate_sub_w_borrow_block.
+ *
+ * @param node pointing to the node to be lowered
+ * @param mark unique traversal mark for blif elaboration pass
+ * @param netlist pointer to the current netlist file
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_sub_w_borrow(nnode_t *node, short mark, netlist_t *netlist)
+{
+    /* port counts: two operands at least, and something to drive */
+    oassert(node->num_input_port_sizes > 1);
+    oassert(node->num_output_port_sizes > 0);
+
+    /* port widths: the operands and the result are single bits */
+    const auto &in_widths = node->input_port_sizes;
+    const auto &out_widths = node->output_port_sizes;
+    oassert(in_widths[0] == 1);
+    oassert(in_widths[1] == 1);
+    oassert(out_widths[0] == 1);
+
+    /* the checks passed, build the subtraction logic */
+    instantiate_sub_w_borrow_block(node, mark, netlist);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function:  instantiate_unary_sub )
+ *	Does 2's complement which is the equivalent of a unary subtraction as a HW implementation.
+ *	The work is delegated unchanged to instantiate_sub_w_carry.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_unary_sub(nnode_t *node, short mark, netlist_t *netlist)
+{
+    instantiate_sub_w_carry(node, mark, netlist);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_EQUAL )
+ *	Builds the hardware for an (in)equality comparison.
+ *	LOGICAL_EQUAL: the operands are XNORed bit by bit and the results go through an AND.
+ *	Any other type: the operands are XORed bit by bit and the results go through an OR.
+ *
+ *	Every pin of both operands is moved onto the compare gate. When one operand is
+ *	narrower than the other, its missing high bits are padded with zero by
+ *	instantiate_bitwise_logic, so the high bits of the wider operand still take part
+ *	in the comparison. The node is freed at the end.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_EQUAL(nnode_t *node, operation_list type, short mark, netlist_t *netlist)
+{
+    oassert(node->num_output_pins == 1);
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 2);
+
+    const int width_a = node->input_port_sizes[0];
+    const int width_b = node->input_port_sizes[1];
+    const int width_max = width_a > width_b ? width_a : width_b;
+    /* the pins of operand B follow the pins of operand A */
+    const int port_B_offset = width_a;
+
+    nnode_t *compare;
+    nnode_t *combine;
+    if (type == LOGICAL_EQUAL) {
+        /* equal: all the bits must match */
+        compare = make_2port_gate(LOGICAL_XNOR, width_a, width_b, width_max, node, mark);
+        combine = make_1port_logic_gate(LOGICAL_AND, width_max, node, mark);
+    } else {
+        /* not equal: one differing bit is enough */
+        compare = make_2port_gate(LOGICAL_XOR, width_a, width_b, width_max, node, mark);
+        combine = make_1port_logic_gate(LOGICAL_OR, width_max, node, mark);
+    }
+
+    for (int i = 0; i < width_max; i++) {
+        /* Move every existing operand pin onto the compare gate. A zero must not be
+         * written into the slot of a pin that exists: that would drop the high bits of
+         * the wider operand from the comparison and leave their pins on the node that is
+         * freed below. The slot of a missing bit is left empty on purpose, it is padded
+         * with zero when the compare gate is broken into single bit gates. */
+        if (i < width_a)
+            remap_pin_to_new_node(node->input_pins[i], compare, i);
+
+        if (i < width_b)
+            remap_pin_to_new_node(node->input_pins[i + port_B_offset], compare, i + port_B_offset);
+
+        /* hook the bit result up to the combining gate */
+        connect_nodes(compare, i, combine, i);
+    }
+
+    /* join the combining gate to the output */
+    remap_pin_to_new_node(node->output_pins[0], combine, 0);
+
+    /* break the compare gate into single bit gates */
+    if (type == LOGICAL_EQUAL)
+        instantiate_bitwise_logic(compare, BITWISE_XNOR, mark, netlist);
+    else
+        instantiate_bitwise_logic(compare, BITWISE_XOR, mark, netlist);
+    /* Don't need to instantiate the combining gate since it is a function itself */
+
+    oassert(combine->num_output_pins == 1);
+
+    free_nnode(node);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_GT )
+ *	Defines the HW needed for a greater than (GT) or less than (LT) comparison.
+ *	One three input GT cell is built per bit; its third input comes from a chain of OR
+ *	cells fed by the XOR of the operand bits above bit 0. The outputs of all the GT cells
+ *	are collected by a final OR gate that takes over the output pin of the node.
+ *	For LT the two operands are swapped. The node is freed at the end.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_GT(nnode_t *node, operation_list type, short mark, netlist_t *netlist)
+{
+    int width_a;
+    int width_b;
+    int width_max;
+    int port_A_offset;
+    int port_B_offset;
+    int port_A_index;
+    int port_B_index;
+    int index = 0;
+    nnode_t *xor_gate = NULL;
+    nnode_t *logical_or_gate;
+    nnode_t **or_cells;
+    nnode_t **gt_cells;
+
+    oassert(node->num_output_pins == 1);
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->input_port_sizes[0] == node->input_port_sizes[1]);
+    width_a = node->input_port_sizes[0];
+    width_b = node->input_port_sizes[1];
+    width_max = width_a > width_b ? width_a : width_b;
+
+    /* swaps ports A and B: the offsets locate the operands in the pins of the node,
+     * the indexes locate them in the pins of the xor gate */
+    if (type == GT) {
+        port_A_offset = 0;
+        port_B_offset = width_a;
+        port_A_index = 0;
+        port_B_index = width_a - 1;
+    } else if (type == LT) {
+        port_A_offset = width_b;
+        port_B_offset = 0;
+        port_A_index = width_b - 1;
+        port_B_index = 0;
+    } else {
+        port_A_offset = 0;
+        port_B_offset = 0;
+        port_A_index = 0;
+        port_B_index = 0;
+        error_message(NETLIST, node->loc, "Invalid node type %s in instantiate_GT\n", node_name_based_on_op(node));
+    }
+
+    if (width_max > 1) {
+        /* xor gate identifies if any bits don't match (bit 0 is left out) */
+        xor_gate = make_2port_gate(LOGICAL_XOR, width_a - 1, width_b - 1, width_max - 1, node, mark);
+    }
+
+    /* collects all the GT signals and determines if gt */
+    logical_or_gate = make_1port_logic_gate(LOGICAL_OR, width_max, node, mark);
+    /* collects a chain if any 1 happens than the GT cells output 0.
+     * There is one OR cell less than there are bits; the parentheses matter, without
+     * them one byte is subtracted from the size instead of one element from the count */
+    or_cells = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * (width_max - 1));
+    /* each cell checks if A > B and sends out a 1 if history has no 1s (3rd input) */
+    gt_cells = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * width_max);
+
+    for (int i = 0; i < width_max; i++) {
+        gt_cells[i] = make_3port_gate(GT, 1, 1, 1, 1, node, mark);
+        if (i < width_max - 1) {
+            or_cells[i] = make_2port_gate(LOGICAL_OR, 1, 1, 1, node, mark);
+        }
+    }
+
+    /* connect inputs.  In the case that a signal is smaller than the other then zero pad */
+    for (int i = 0; i < width_max; i++) {
+        /* Joining the inputs to the input 1 of that gate */
+        if (i < width_a) {
+            /* IF - operand A has this bit: move it to the cell and copy it to the xor gate */
+            remap_pin_to_new_node(node->input_pins[i + port_A_offset], gt_cells[i], 0);
+            if (i > 0)
+                add_input_pin_to_node(xor_gate, copy_input_npin(gt_cells[i]->input_pins[0]), index + port_A_index);
+        } else {
+            /* ELSE - operand A does not have this bit, use zero */
+            add_input_pin_to_node(gt_cells[i], get_zero_pin(netlist), 0);
+            if (i > 0)
+                add_input_pin_to_node(xor_gate, get_zero_pin(netlist), index + port_A_index);
+        }
+
+        if (i < width_b) {
+            /* IF - operand B has this bit: move it to the cell and copy it to the xor gate */
+            /* Joining the inputs to the input 2 of that gate */
+            remap_pin_to_new_node(node->input_pins[i + port_B_offset], gt_cells[i], 1);
+            if (i > 0)
+                add_input_pin_to_node(xor_gate, copy_input_npin(gt_cells[i]->input_pins[1]), index + port_B_index);
+        } else {
+            /* ELSE - operand B does not have this bit, use zero */
+            add_input_pin_to_node(gt_cells[i], get_zero_pin(netlist), 1);
+            if (i > 0)
+                add_input_pin_to_node(xor_gate, get_zero_pin(netlist), index + port_B_index);
+        }
+
+        if (i < width_max - 1) {
+            /* number of OR gates */
+            if (i < width_max - 2) {
+                /* connect the msb or to the next lower bit */
+                connect_nodes(or_cells[i + 1], 0, or_cells[i], 1);
+            } else {
+                /* the OR cell at the top of the chain has no history, so it gets a zero */
+                add_input_pin_to_node(or_cells[i], get_zero_pin(netlist), 1);
+            }
+            if (width_max > 1) {
+                /* get all the equals with the or gates */
+                connect_nodes(xor_gate, i, or_cells[i], 0);
+            }
+
+            connect_nodes(or_cells[i], 0, gt_cells[i], 2);
+
+        } else {
+            /* deal with the first greater than test which autom gets a zero */
+            add_input_pin_to_node(gt_cells[i], get_zero_pin(netlist), 2);
+        }
+
+        /* hook it up to the final logical OR */
+        connect_nodes(gt_cells[i], 0, logical_or_gate, i);
+
+        /* index follows the xor gate bits, which start at bit 1 of the operands */
+        if (i > 0) {
+            index++;
+        }
+    }
+
+    /* join that gate to the output */
+    remap_pin_to_new_node(node->output_pins[0], logical_or_gate, 0);
+    oassert(logical_or_gate->num_output_pins == 1);
+    if (xor_gate != NULL) {
+        instantiate_bitwise_logic(xor_gate, BITWISE_XOR, mark, netlist);
+    }
+
+    vtr::free(gt_cells);
+    vtr::free(or_cells);
+    free_nnode(node);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: instantiate_GE )
+ *	Builds >= (type == GTE) or <= (any other type) as LOGICAL_OR(LOGICAL_EQUAL(a, b), GT/LT(a, b)),
+ *	moves the output of `node` onto that OR gate, expands both sub-gates and frees `node`.
+ *-------------------------------------------------------------------------------------------*/
+void instantiate_GE(nnode_t *node, operation_list type, short mark, netlist_t *netlist)
+{
+    int width_a;
+    int width_b;
+    int width_max;
+    int port_B_offset;
+    int port_A_offset;
+    nnode_t *equal;
+    nnode_t *compare;
+    nnode_t *logical_or_final_gate;
+
+    oassert(node->num_output_pins == 1);
+    oassert(node->num_input_pins > 0);
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->input_port_sizes[0] == node->input_port_sizes[1]);
+    width_a = node->input_port_sizes[0];
+    width_b = node->input_port_sizes[1];
+    oassert(width_a == width_b);
+    width_max = width_a > width_b ? width_a : width_b;
+
+    port_A_offset = 0;
+    port_B_offset = width_a;
+
+    /* equality gate over both operands; it is expanded by instantiate_EQUAL further down */
+    equal = make_2port_gate(LOGICAL_EQUAL, width_a, width_b, 1, node, mark);
+
+    if (type == GTE)
+        compare = make_2port_gate(GT, width_a, width_b, 1, node, mark);
+    else
+        compare = make_2port_gate(LT, width_a, width_b, 1, node, mark);
+
+    logical_or_final_gate = make_1port_logic_gate(LOGICAL_OR, 2, node, mark);
+
+    /* connect inputs. The oasserts above require equal widths, so the zero-pad branches are only a fallback */
+    for (int i = 0; i < width_max; i++) {
+        /* port A (first input port of both gates) */
+        if (i < width_a) {
+            /* bit i of port A exists: move the pin onto `equal` and give `compare` a copy of it */
+            remap_pin_to_new_node(node->input_pins[i + port_A_offset], equal, i + port_A_offset);
+            add_input_pin_to_node(compare, copy_input_npin(equal->input_pins[i + port_A_offset]), i + port_A_offset);
+        } else {
+            /* port A has no bit i: drive both gates with the constant zero pin */
+            add_input_pin_to_node(equal, get_zero_pin(netlist), i + port_A_offset);
+            add_input_pin_to_node(compare, get_zero_pin(netlist), i + port_A_offset);
+        }
+
+        if (i < width_b) {
+            /* port B (second input port of both gates) */
+            /* bit i of port B exists: move the pin onto `equal` and give `compare` a copy of it */
+            remap_pin_to_new_node(node->input_pins[i + port_B_offset], equal, i + port_B_offset);
+            add_input_pin_to_node(compare, copy_input_npin(equal->input_pins[i + port_B_offset]), i + port_B_offset);
+        } else {
+            /* port B has no bit i: drive both gates with the constant zero pin */
+            add_input_pin_to_node(equal, get_zero_pin(netlist), i + port_B_offset);
+            add_input_pin_to_node(compare, get_zero_pin(netlist), i + port_B_offset);
+        }
+    }
+    connect_nodes(equal, 0, logical_or_final_gate, 0);
+    connect_nodes(compare, 0, logical_or_final_gate, 1);
+
+    /* the final OR gate takes over the output pin of the original node */
+    remap_pin_to_new_node(node->output_pins[0], logical_or_final_gate, 0);
+    oassert(logical_or_final_gate->num_output_pins == 1);
+
+    /* expand the two intermediate gates into their own logic */
+    instantiate_EQUAL(equal, LOGICAL_EQUAL, mark, netlist);
+
+    if (type == GTE)
+        instantiate_GT(compare, GT, mark, netlist);
+    else
+        instantiate_GT(compare, LT, mark, netlist);
+
+    free_nnode(node);
+}
+
+/**
+ * --------------------------------------------------------------------------
+ * (function: instantiate_shift)
+ *
+ * @brief instantiate a shift node: dispatch to the constant or the variable
+ * implementation depending on whether the shift-amount signal is constant.
+ *
+ * @note first_signal 'SHIFT_OP' second_signal
+ *
+ * @param node pointer to the shift netlist node (node->type selects the shift kind)
+ * @param mark traversal number
+ * @param netlist pointer to the current netlist
+ * -------------------------------------------------------------------------*/
+void instantiate_shift(nnode_t *node, short mark, netlist_t *netlist)
+{
+    /* validate number and size of input ports */
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->input_port_sizes[0] == node->input_port_sizes[1]);
+
+    int operand_width = node->input_port_sizes[0];
+    int shift_width = node->input_port_sizes[1];
+    /* collect the shift-amount pins: they follow the operand pins in node->input_pins */
+    signal_list_t *shift_signal = init_signal_list();
+    for (int i = 0; i < shift_width; i++) {
+        add_pin_to_signal_list(shift_signal, node->input_pins[operand_width + i]);
+    }
+
+    /* check for constant and variable shift operation */
+    if (is_constant_signal(shift_signal, netlist)) {
+        /* the shift signal is constant, no need to implement the complex variable circuitry */
+        instantiate_constant_shift(node, node->type, mark, netlist);
+    } else {
+        /* the shift signal is variable, variable shift will be instantiated */
+        instantiate_variable_shift(node, node->type, mark, netlist);
+    }
+
+    // CLEAN UP the temporary list that was only needed for the constant check
+    free_signal_list(shift_signal);
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: instantiate_constant_shift )
+ *
+ * @brief instantiate a shift node whose shift amount is a CONSTANT
+ * signal. No shifter logic is built: operand pins are rewired to the
+ * shifted output positions and the remaining outputs are padded.
+ *
+ * @param node shift node
+ * @param type shift type [SL,SR, ASL, and ASR]
+ * @param mark traversal number
+ * @param netlist pointer to the current netlist
+ *-------------------------------------------------------------------------------------------*/
+static void instantiate_constant_shift(nnode_t *node, operation_list type, short mark, netlist_t *netlist)
+{
+    /* validate number and size of input ports */
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->input_port_sizes[0] == node->input_port_sizes[1]);
+
+    int operand_width = node->input_port_sizes[0];
+    int shift_width = node->input_port_sizes[1];
+    int output_width = node->output_port_sizes[0];
+
+    /* operand signal: the first operand_width input pins */
+    signal_list_t *operand_signal = init_signal_list();
+    for (int i = 0; i < operand_width; i++) {
+        add_pin_to_signal_list(operand_signal, node->input_pins[i]);
+    }
+    /* shift signal: the input pins that follow the operand */
+    signal_list_t *shift_signal = init_signal_list();
+    for (int i = 0; i < shift_width; i++) {
+        add_pin_to_signal_list(shift_signal, node->input_pins[operand_width + i]);
+    }
+
+    /* validate constant shift signal */
+    oassert(is_constant_signal(shift_signal, netlist));
+
+    /* result collects, LSB first, the pin that will drive each output bit */
+    signal_list_t *result = init_signal_list();
+    /* index of the operand's most significant pin; used as the sign-extension source for ASR */
+    int pad_bit = operand_width - 1;
+    /* calculate the value of shift signal */
+    long shift_size = constant_signal_value(shift_signal, netlist);
+
+    switch (type) {
+    case SL:
+    case ASL: {
+        /* connect ZERO to outputs that don't have inputs connected */
+        for (int i = 0; i < shift_size; i++) {
+            if (i < output_width) {
+                // connect 0 to lower outputs
+                add_pin_to_signal_list(result, get_zero_pin(netlist));
+            }
+        }
+
+        /* connect inputs to outputs */
+        for (int i = 0; i < output_width - shift_size; i++) {
+            if (i < operand_width) {
+                npin_t *pin = operand_signal->pins[i];
+                // connect higher output pin to lower input pin
+                add_pin_to_signal_list(result, pin);
+                /* clear the old node's slot for this pin, the pin itself is reused */
+                pin->node->input_pins[pin->pin_node_idx] = NULL;
+            } else {
+                /* pad with zero pins */
+                npin_t *extension_pin = get_zero_pin(netlist);
+                add_pin_to_signal_list(result, extension_pin);
+            }
+        }
+        break;
+    }
+    case SR: // fallthrough
+    case ASR: {
+        for (int i = shift_size; i < operand_width; i++) {
+            npin_t *pin = operand_signal->pins[i];
+            // connect lower output pin (i - shift_size) to higher input pin (i)
+            if (i - shift_size < output_width) {
+                add_pin_to_signal_list(result, pin);
+                pin->node->input_pins[pin->pin_node_idx] = NULL;
+            }
+        }
+
+        /* Extend pad_bit to outputs that don't have inputs connected */
+        for (int i = output_width - 1; i >= operand_width - shift_size; i--) {
+            npin_t *extension_pin = NULL;
+            if (node->related_ast_node && node->attributes->port_a_signed == SIGNED && node->type == ASR) {
+                /* signed ASR: pad with a copy of the operand's most significant pin */
+                extension_pin = copy_input_npin(operand_signal->pins[pad_bit]);
+            } else {
+                /* otherwise result will be padded with zero pins */
+                extension_pin = get_zero_pin(netlist);
+            }
+
+            add_pin_to_signal_list(result, extension_pin);
+        }
+        break;
+    }
+    default:
+        error_message(NETLIST, node->loc, "%s", "Operation not supported by Odin\n");
+        break;
+    }
+
+    for (int i = 0; i < output_width; i++) {
+        /* create a buf node to drive output pins */
+        nnode_t *buf_node = make_1port_gate(BUF_NODE, 1, 1, node, mark);
+        /* result pin i becomes the single input pin of the buf node */
+        add_input_pin_to_node(buf_node, result->pins[i], 0);
+
+        /* remap output signals to buf node */
+        remap_pin_to_new_node(node->output_pins[i], buf_node, 0);
+    }
+
+    // CLEAN UP
+    for (int i = 0; i < operand_signal->count; i++) {
+        /* delete the operand pins that still point at `node` */
+        if (operand_signal->pins[i]->node == node)
+            delete_npin(operand_signal->pins[i]);
+    }
+    free_signal_list(operand_signal);
+    for (int i = 0; i < shift_signal->count; i++) {
+        /* the shift-amount pins are no longer needed once the shift is hard-wired */
+        delete_npin(shift_signal->pins[i]);
+    }
+    free_signal_list(shift_signal);
+    free_signal_list(result);
+    free_nnode(node);
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: instantiate_variable_shift )
+ *
+ * @brief instantiate a shift node whose shift amount is not a
+ * CONSTANT signal. The variable shift implements the shift
+ * operation as a barrel shifter made of stages of SMUX_2 nodes
+ *
+ * @param node shift node
+ * @param type shift type [SL,SR, ASL, and ASR]
+ * @param mark traversal number
+ * @param netlist pointer to the current netlist
+ *-------------------------------------------------------------------------------------------*/
+static void instantiate_variable_shift(nnode_t *node, operation_list type, short mark, netlist_t *netlist)
+{
+    /*
+     *   Create mux 2:1
+     *   data1 = SHIFT first_input
+     *   data2 = SHIFT first_input shifted right or left by pow(2,i)
+     *   selector = SHIFT second_input[i]
+     */
+
+    nnode_t ***muxes;
+    signal_list_t *input_pins = init_signal_list();
+    signal_list_t *output_pins;
+    int input_port_width = 0;
+    int output_port_width = 0;
+    int pow_2_by_i = 0;
+
+    for (int i = 0; i < node->num_input_pins; i++) {
+        add_pin_to_signal_list(input_pins, node->input_pins[i]);
+    }
+
+    /* the first stage reads the node's own input pins; later stages read the previous stage's outputs */
+    output_pins = input_pins;
+    output_port_width = node->output_port_sizes[0];
+    input_port_width = node->input_port_sizes[0];
+    muxes = (nnode_t ***)vtr::malloc(sizeof(nnode_t **) * (input_port_width));
+
+    /* muxes[i][j]: mux of stage i for bit j; every mux of stage i is selected by shift bit i */
+    for (int i = 0; i < input_port_width; i++) {
+        muxes[i] = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * (input_port_width));
+        for (int j = 0; j < input_port_width; j++) {
+            muxes[i][j] = make_2port_gate(SMUX_2, 1, 2, 1, node, mark);
+            /* add selector pin */
+            add_input_pin_to_node(muxes[i][j], copy_input_npin(node->input_pins[input_port_width + i]), 0);
+        }
+    }
+
+    for (int i = 0; i < input_port_width; i++) {
+        pow_2_by_i = shift_left_value_with_overflow_check(0x1, i, node->loc);
+        /*
+         * Limit shift value of barrel design to max at input_port_width,
+         * since after this value we should extend based on extension bit.
+         * Also, checking the overflow of pow_2_by_i
+         */
+        int shift_size = (pow_2_by_i > input_port_width || pow_2_by_i < 0) ? input_port_width : pow_2_by_i;
+
+        input_pins = output_pins;
+
+        if (type == SR || type == ASR) {
+            /*
+             * Logical variable shift right
+             * or
+             * arithmetic variable shift right
+             */
+
+            // shift by pow(2,i): a copy of the previous stage's bit j goes to input pin 2 of the mux for bit j - shift_size
+            for (int j = shift_size; j < input_port_width; j++)
+                add_input_pin_to_node(muxes[i][j - shift_size], copy_input_npin(input_pins->pins[j]), 2);
+            // the top shift_size muxes get the extension bit: the previous stage's top bit for signed ASR, zero otherwise
+            int pad_bit = input_port_width - 1;
+            for (int j = 0; j < shift_size; j++) {
+                if (node->attributes->port_a_signed == SIGNED && type == ASR) {
+                    add_input_pin_to_node(muxes[i][j + input_port_width - shift_size], copy_input_npin(input_pins->pins[pad_bit]), 2);
+                } else {
+                    add_input_pin_to_node(muxes[i][j + input_port_width - shift_size], get_zero_pin(netlist), 2);
+                }
+            }
+
+        } else {
+            /*
+             * Logical variable shift left
+             * or
+             * arithmetic variable shift left
+             */
+
+            // shift by pow(2,i): a copy of the previous stage's bit j goes to input pin 2 of the mux for bit j + shift_size
+            for (int j = 0; j < input_port_width - shift_size; j++)
+                add_input_pin_to_node(muxes[i][j + shift_size], copy_input_npin(input_pins->pins[j]), 2);
+
+            // connect the zero as extension bits
+            for (int j = 0; j < shift_size; j++)
+                add_input_pin_to_node(muxes[i][j], get_zero_pin(netlist), 2);
+        }
+
+        for (int j = 0; j < input_port_width; j++) {
+            // the unshifted bit j of the previous stage goes to input pin 1 of this stage's mux (the pin itself, not a copy)
+            add_input_pin_to_node(muxes[i][j], input_pins->pins[j], 1);
+        }
+
+        /* this stage's input list is consumed; start a fresh list for the pins feeding the next stage */
+        free_signal_list(input_pins);
+        output_pins = init_signal_list();
+        // Connect output pin to related input pin
+        for (int j = 0; j < input_port_width; j++) {
+            if (i != input_port_width - 1) {
+                npin_t *new_pin1 = allocate_npin();
+                npin_t *new_pin2 = allocate_npin();
+                nnet_t *new_net = allocate_nnet();
+                new_net->name = make_full_ref_name(NULL, NULL, NULL, muxes[i][j]->name, j);
+                /* hook the output pin into the node */
+                add_output_pin_to_node(muxes[i][j], new_pin1, 0);
+                /* hook up new pin 1 into the new net */
+                add_driver_pin_to_net(new_net, new_pin1);
+                /* hook up the new pin 2 to this new net */
+                add_fanout_pin_to_net(new_net, new_pin2);
+
+                // Storing the output pins of the current mux stage as the input of the next one
+                add_pin_to_signal_list(output_pins, new_pin2);
+
+            } else {
+                /* last stage: its muxes take over the output pins of the original node */
+                if (j < output_port_width)
+                    remap_pin_to_new_node(node->output_pins[j], muxes[i][j], 0);
+            }
+        }
+    }
+
+    /* only the bookkeeping arrays and the last list are released here; the mux nodes stay in the netlist */
+    free_signal_list(output_pins);
+    for (int i = 0; i < input_port_width; i++) {
+        vtr::free(muxes[i]);
+    }
+    vtr::free(muxes);
+}
diff --git a/parmys-plugin/mapping/partial_map.h b/parmys-plugin/mapping/partial_map.h
new file mode 100644
index 000000000..6371fe0e8
--- /dev/null
+++ b/parmys-plugin/mapping/partial_map.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _PARTIAL_MAP_H_
+#define _PARTIAL_MAP_H_
+
+void partial_map_top(netlist_t *netlist);
+void instantiate_add_w_carry(nnode_t *node, short mark, netlist_t *netlist);
+void instantiate_multi_port_mux(nnode_t *node, short mark, netlist_t *netlist);
+void depth_first_traversal_to_partial_map(short marker_value, netlist_t *netlist);
+void instantiate_multi_port_n_bits_mux(nnode_t *node, short mark, netlist_t * /*netlist*/);
+
+#endif // _PARTIAL_MAP_H_
diff --git a/parmys-plugin/netlist/netlist_cleanup.cc b/parmys-plugin/netlist/netlist_cleanup.cc
new file mode 100644
index 000000000..d336b5ccd
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_cleanup.cc
@@ -0,0 +1,332 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_globals.h"
+#include "odin_types.h"
+#include <algorithm> // std::fill
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "netlist_utils.h"
+#include "odin_ii.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+USING_YOSYS_NAMESPACE
+
+bool coarsen_cleanup;
+
+/* Used in the nnode_t.node_data field to mark if the node was already visited
+ * during a forward or backward sweep traversal or the removal phase */
+int _visited_forward, _visited_backward, _visited_removal;
+#define VISITED_FORWARD ((void *)&_visited_forward)
+#define VISITED_BACKWARD ((void *)&_visited_backward)
+#define VISITED_REMOVAL ((void *)&_visited_removal)
+
+/* Simple linked list of nodes structure */
+struct node_list_t {
+    nnode_t *node;
+    struct node_list_t *next;
+};
+
+node_list_t useless_nodes;                       // List of the nodes to be removed
+node_list_t *removal_list_next = &useless_nodes; // Tail of the nodes to be removed
+
+node_list_t addsub_nodes;                              // List of the adder/subtractor nodes
+node_list_t *addsub_list_next = &addsub_nodes;         // Tail of the adder/subtractor node list
+long long num_removed_nodes[operation_list_END] = {0}; // List of removed nodes by type
+
+/* Function declarations */
+node_list_t *insert_node_list(node_list_t *node_list, nnode_t *node);
+void traverse_backward(nnode_t *node);
+void traverse_forward(nnode_t *node, int toplevel, int remove_me);
+void mark_output_dependencies(netlist_t *netlist);
+void identify_unused_nodes(netlist_t *netlist);
+void remove_unused_nodes(node_list_t *remove);
+void calculate_addsub_statistics(node_list_t *addsub);
+void remove_unused_logic(netlist_t *netlist);
+void count_node_type(nnode_t *node);
+void report_removed_nodes(long long *node_list);
+
+node_list_t *insert_node_list(node_list_t *node_list, nnode_t *node)
+{
+    /* fill the slot passed in, then hang a freshly calloc'ed slot behind it and hand that one back as the new tail */
+    node_list->node = node;
+    return node_list->next = (node_list_t *)vtr::calloc(1, sizeof(node_list_t));
+}
+
+/* Backward sweep: tag `node` and, recursively, every node driving one of its inputs with VISITED_BACKWARD */
+void traverse_backward(nnode_t *node)
+{
+    if (node == NULL || node->node_data == VISITED_BACKWARD)
+        return;                         // Nothing to visit, or already visited
+    node->node_data = VISITED_BACKWARD; // Mark before recursing so a loop in the netlist terminates
+    for (int i = 0; i < node->num_input_pins; i++) {
+        npin_t *input_pin = node->input_pins[i];
+        nnet_t *net = input_pin ? input_pin->net : NULL; // an empty pin slot or a pin without a net has nothing upstream
+        for (int j = 0; net && j < net->num_driver_pins; j++) {
+            if (net->driver_pins[j] && net->driver_pins[j]->node)
+                traverse_backward(net->driver_pins[j]->node); // Visit the drivers of this node
+        }
+    }
+}
+
+/* Traverse the netlist forward, moving from inputs to outputs.
+ * Adds nodes that do not affect any outputs to the useless_nodes list and
+ * records the heads of ADD/MINUS chains in the addsub_nodes list. Arguments:
+ * 	node: the current node in the netlist
+ * 	toplevel: are we at one of the top-level nodes? (GND, VCC, PAD or INPUT)
+ * 	remove_me: should the current node be removed? (inherited by the fanout nodes visited from here)
+ * */
+void traverse_forward(nnode_t *node, int toplevel, int remove_me)
+{
+    if (node == NULL)
+        return; // Shouldn't happen, but check just in case
+    if (node->node_data == VISITED_FORWARD)
+        return; // Already visited, shouldn't happen anyway
+
+    /* We want to remove this node if either its parent was removed,
+     * or if it was not visited on the backwards sweep (top-level nodes are exempt from the latter) */
+    remove_me = remove_me || ((node->node_data != VISITED_BACKWARD) && (toplevel == false));
+
+    /* Mark this node as visited; this overwrites the VISITED_BACKWARD tag tested just above */
+    node->node_data = VISITED_FORWARD;
+
+    if (remove_me) {
+        /* Add this node to the list of nodes to remove and tally it by type */
+        removal_list_next = insert_node_list(removal_list_next, node);
+        count_node_type(node);
+    }
+
+    if (node->type == ADD || node->type == MINUS) {
+        // the chain start is a pad node, or with adder_cin_global a global gnd (ADD) / vcc (MINUS) node
+        auto ADDER_START_NODE = PAD_NODE;
+        if (configuration.adder_cin_global) {
+            if (node->type == ADD)
+                ADDER_START_NODE = GND_NODE;
+            else
+                ADDER_START_NODE = VCC_NODE;
+        }
+        oassert(node->input_pins[node->num_input_pins - 1]->net->num_driver_pins == 1);
+        /* The last input pin is checked: if its driver is the start node, this is the head of a chain */
+        if (node->input_pins[node->num_input_pins - 1]->net->driver_pins[0]->node->type == ADDER_START_NODE) {
+            addsub_list_next = insert_node_list(addsub_list_next, node);
+        }
+    }
+
+    /* Iterate through every fanout node, skipping empty pin, net and node slots */
+    for (int i = 0; i < node->num_output_pins; i++) {
+        if (node->output_pins[i] && node->output_pins[i]->net) {
+            for (int j = 0; j < node->output_pins[i]->net->num_fanout_pins; j++) {
+                if (node->output_pins[i]->net->fanout_pins[j]) {
+                    nnode_t *child = node->output_pins[i]->net->fanout_pins[j]->node;
+                    if (child) {
+                        /* If this child hasn't already been visited, visit it now (never as a top-level node) */
+                        if (child->node_data != VISITED_FORWARD) {
+                            traverse_forward(child, false, remove_me);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/* Seed the backward sweep: run traverse_backward from every top-level output node so that
+ * each node reached through driver links ends up tagged VISITED_BACKWARD */
+void mark_output_dependencies(netlist_t *netlist)
+{
+    const auto output_count = netlist->num_top_output_nodes;
+    for (int out = 0; out < output_count; ++out)
+        traverse_backward(netlist->top_output_nodes[out]);
+}
+
+/* Traverse the netlist forward from the top level inputs and special nodes (VCC, GND, PAD).
+ * Both result lists are reset first, tail pointers included, so a repeated run starts from empty lists. */
+void identify_unused_nodes(netlist_t *netlist)
+{
+    useless_nodes.node = NULL;
+    useless_nodes.next = NULL;
+    removal_list_next = &useless_nodes; // without this a second run appends behind the stale tail and the head stays empty
+    addsub_nodes.node = NULL;
+    addsub_nodes.next = NULL;
+    addsub_list_next = &addsub_nodes;
+    traverse_forward(netlist->gnd_node, true, false);
+    traverse_forward(netlist->vcc_node, true, false);
+    traverse_forward(netlist->pad_node, true, false);
+    for (int i = 0; i < netlist->num_top_input_nodes; i++) {
+        traverse_forward(netlist->top_input_nodes[i], true, false);
+    }
+}
+
+/* Detach every listed node from the nets feeding it and tag it VISITED_REMOVAL.
+ * Nothing is freed here: only the nets' fanout slots for the node's input pins are cleared. */
+void remove_unused_nodes(node_list_t *remove)
+{
+    for (node_list_t *entry = remove; entry != NULL && entry->node != NULL; entry = entry->next) {
+        nnode_t *doomed = entry->node;
+        for (int pin_idx = 0; pin_idx < doomed->num_input_pins; pin_idx++) {
+            npin_t *pin = doomed->input_pins[pin_idx];
+            if (pin == NULL)
+                continue; // empty input slot, nothing to unhook
+            pin->net->fanout_pins[pin->pin_net_idx] = NULL;
+        }
+        doomed->node_data = VISITED_REMOVAL;
+    }
+}
+
+/* Since we are traversing the entire netlist anyway, we can use this
+ * opportunity to keep track of the heads of adder/subtractors chains
+ * and then compute statistics on them */
+long adder_chain_count = 0;
+long longest_adder_chain = 0;
+long total_adders = 0;
+
+long subtractor_chain_count = 0;
+long longest_subtractor_chain = 0;
+long total_subtractors = 0;
+
+double geomean_addsub_length = 0.0; // Geometric mean of add/sub chain length
+double sum_of_addsub_logs = 0.0;    // Sum of the logarithms of the add/sub chain lengths; used for geomean
+double total_addsub_chain_count = 0.0;
+
+/* Follow each recorded chain head along its carry-out (output pin 0), update the adder and
+ * subtractor chain counters, then recompute the geometric mean chain length. */
+void calculate_addsub_statistics(node_list_t *addsub)
+{
+    while (addsub != NULL && addsub->node != NULL) {
+        int found_tail = false;
+        nnode_t *node = addsub->node;
+        int chain_depth = 0;
+        while (!found_tail) {
+            if (node->node_data == VISITED_REMOVAL)
+                break; // a detached node ends the chain and is not counted
+            chain_depth += 1;
+
+            /* Carry out is always output pin 0 */
+            nnet_t *carry_out_net = node->output_pins[0]->net;
+            if (carry_out_net == NULL || carry_out_net->fanout_pins[0] == NULL)
+                found_tail = true;
+            else
+                node = carry_out_net->fanout_pins[0]->node;
+        }
+        if (chain_depth > 0) {
+            if (node->type == ADD) {
+                adder_chain_count += 1;
+                total_adders += chain_depth;
+                if (chain_depth > longest_adder_chain)
+                    longest_adder_chain = chain_depth;
+            } else if (node->type == MINUS) {
+                subtractor_chain_count += 1;
+                total_subtractors += chain_depth;
+                if (chain_depth > longest_subtractor_chain)
+                    longest_subtractor_chain = chain_depth;
+            }
+
+            sum_of_addsub_logs += log(chain_depth);
+            total_addsub_chain_count += 1.0;
+        }
+
+        addsub = addsub->next;
+    }
+    /* Geometric mean carry chain length; with no chains the quotient would be 0/0 (NaN), so report 0 instead */
+    geomean_addsub_length = (total_addsub_chain_count > 0.0) ? exp(sum_of_addsub_logs / total_addsub_chain_count) : 0.0;
+}
+void count_node_type(nnode_t *node) // tally one removed node in num_removed_nodes, bucketed by its type
+{
+    switch (node->type) {
+    case LOGICAL_OR:   // fallthrough
+    case LOGICAL_AND:  // fallthrough
+    case LOGICAL_NOR:  // fallthrough
+    case LOGICAL_NAND: // fallthrough
+    case LOGICAL_XOR:  // fallthrough
+    case LOGICAL_XNOR: // fallthrough
+    case LOGICAL_NOT:  // logic gates: counted under their own type and under GENERIC
+        num_removed_nodes[node->type]++;
+        num_removed_nodes[GENERIC]++;
+        break;
+
+    case MUX_2:  // fallthrough
+    case SMUX_2: // both mux kinds are counted as MUX_2 and under GENERIC
+        num_removed_nodes[MUX_2]++;
+        num_removed_nodes[GENERIC]++;
+        break;
+
+    case GENERIC: // counted once, under GENERIC
+        num_removed_nodes[node->type]++;
+        break;
+
+    case MINUS: // counted as ADD
+        /* Minus nodes are built of Add nodes */
+        num_removed_nodes[ADD]++;
+        break;
+
+    case PAD_NODE: // fallthrough
+    case GND_NODE: // fallthrough
+    case VCC_NODE: // not counted at all
+        /* These are irrelevant, so they are left out of the report */
+        break;
+
+    case INPUT_NODE:  // fallthrough
+    case OUTPUT_NODE: // counted under their own type
+        /* counted under their own type only, not added to the GENERIC total */
+        num_removed_nodes[node->type]++;
+        break;
+
+    case CLOCK_NODE: // fallthrough
+    case FF_NODE:    // fallthrough
+    case MULTIPLY:   // fallthrough
+    case ADD:        // fallthrough
+    case MEMORY:     // fallthrough
+    case HARD_IP:    // counted under their own type
+        /* counted under their own type only, not added to the GENERIC total */
+        num_removed_nodes[node->type]++;
+        break;
+
+    default:
+        /* everything else is generic */
+        num_removed_nodes[GENERIC]++;
+        break;
+    }
+}
+
+void report_removed_nodes(long long *node_list)
+{
+    // an empty head slot in useless_nodes means nothing was queued for removal: stay silent
+    if (useless_nodes.node == NULL)
+        return;
+
+    warning_message(NETLIST, unknown_location, "%s", "Following unused node(s) removed from the netlist:\n");
+    for (int op = 0; op < operation_list_END; op++) {
+        if (node_list[op] <= UNUSED_NODE_TYPE)
+            continue; // count not above the UNUSED_NODE_TYPE threshold: no line for this type
+        std::string label = std::string("Number of removed <") + operation_list_STR[op][ODIN_LONG_STRING] + "> node(s): ";
+        log("%-42s%lld\n", label.c_str(), node_list[op]);
+    }
+}
+
+/* Cleanup entry point: backward sweep, forward sweep, detach the unused nodes, then report and gather add/sub statistics */
+void remove_unused_logic(netlist_t *netlist)
+{
+    mark_output_dependencies(netlist);   // tag everything the top-level outputs depend on
+    identify_unused_nodes(netlist);      // fill useless_nodes and addsub_nodes
+    remove_unused_nodes(&useless_nodes); // detach (not free) the nodes found unused
+    if (global_args.all_warnings)
+        report_removed_nodes(num_removed_nodes); // per-type summary, printed only when all_warnings is set
+    calculate_addsub_statistics(&addsub_nodes);
+}
\ No newline at end of file
diff --git a/parmys-plugin/netlist/netlist_cleanup.h b/parmys-plugin/netlist/netlist_cleanup.h
new file mode 100644
index 000000000..71a66b187
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_cleanup.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _NETLIST_CLEANUP_H_
+#define _NETLIST_CLEANUP_H_
+
+void remove_unused_logic(netlist_t *netlist);
+
+#endif // _NETLIST_CLEANUP_H_
diff --git a/parmys-plugin/netlist/netlist_statistic.cc b/parmys-plugin/netlist/netlist_statistic.cc
new file mode 100644
index 000000000..3e97652ad
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_statistic.cc
@@ -0,0 +1,390 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <algorithm>
+
+/* for hb */
+#include "multiplier.h"
+
+#include "netlist_statistic.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "vtr_memory.h"
+
+USING_YOSYS_NAMESPACE
+
+static void init(metric_t *m);
+static void print_stats(metric_t *m);
+static void copy(metric_t *dest, metric_t *src);
+
+static void add_to_stat(metric_t *dest, long long branching_factor);
+static void count_node_type(nnode_t *node, netlist_t *netlist);
+
+static metric_t *get_upward_stat(nnet_t *net, netlist_t *netlist, uintptr_t traverse_mark_number);
+static metric_t *get_downward_stat(nnet_t *net, netlist_t *netlist, uintptr_t traverse_mark_number);
+static metric_t *get_upward_stat(nnode_t *node, netlist_t *netlist, uintptr_t traverse_mark_number);
+static metric_t *get_downward_stat(nnode_t *node, netlist_t *netlist, uintptr_t traverse_mark_number);
+
+static metric_t *get_upward_stat(metric_t *destination, nnode_t **node_list, long long node_count, netlist_t *netlist,
+                                 uintptr_t traverse_mark_number);
+
+/* Reset a metric record: every depth and width figure goes back to zero,
+ * the value aggregate() reads as "min_depth not recorded yet". */
+static void init(metric_t *m)
+{
+    m->min_depth = m->max_depth = 0;
+    m->avg_depth = m->avg_width = 0;
+}
+
+/* Reset all statistics held by the netlist before a fresh counting pass. */
+void init_stat(netlist_t *netlist)
+{
+    netlist->num_logic_element = 0;
+    netlist->num_of_node = 0;
+    init(&netlist->output_node_stat);
+
+    /* UNUSED_NODE_TYPE flags a type as never seen, so the report skips it */
+    for (int op = 0; op < operation_list_END; op++)
+        netlist->num_of_type[op] = UNUSED_NODE_TYPE;
+}
+
+/* Assign node->weight for the mixing (hard vs. soft logic) optimization; only MULTIPLY nodes are implemented. */
+void mixing_optimization_stats(nnode_t *node, netlist_t *netlist)
+{
+    // start from clean counters so earlier passes do not interfere
+    init_stat(netlist);
+
+    // the optimization is assumed to start from a node whose type matches
+    // the hard block kind being weighted
+    if (node->type != MULTIPLY) {
+        error_message(NETLIST, unknown_location, "Counting weights for mixing optimization for %i: Hard block type is unimplemented", node->type);
+        return;
+    }
+
+    // the weight is the maximum downward depth measured from the multiplier
+    stat_t *stats = get_stats(node, netlist, mult_optimization_traverse_value);
+    node->weight = stats->downward.max_depth;
+    vtr::free(stats);
+}
+
+static void print_stats(metric_t *m) // debug helper: logs the four fields of a metric record, one per line
+{
+    log("\n\t%s:%0.4lf\n\t%s: %0.4lf\n\t%s: %0.4lf\n\t%s: %0.4lf\n", "shortest path", m->min_depth, "critical path", m->max_depth, "average path",
+        m->avg_depth, "overall fan-out", m->avg_width);
+}
+_static_unused(print_stats) // quiet warning: nothing in this file calls print_stats
+
+/* Overwrite *dest with *src; a NULL src leaves dest zeroed, a NULL dest is a no-op. */
+static void copy(metric_t *dest, metric_t *src)
+{
+    if (dest)
+        init(dest);
+    if (dest && src) {
+        dest->avg_width = src->avg_width;
+        dest->avg_depth = src->avg_depth;
+        dest->max_depth = src->max_depth;
+        dest->min_depth = src->min_depth;
+    }
+}
+
+/* Fold the non-NULL records of sources[0..source_count) into dest: min/max of the depths, mean of the two averages. */
+static void aggregate(metric_t *dest, metric_t **sources, long long source_count)
+{
+    init(dest);
+    if (!sources)
+        return;
+
+    long long used = 0;
+    for (long long idx = 0; idx < source_count; idx++) {
+        metric_t *cur = sources[idx];
+        if (!cur)
+            continue;
+        used++;
+        // a zero min_depth means "nothing recorded yet", so the source value is taken as-is
+        bool unset = (dest->min_depth == 0);
+        dest->min_depth = unset ? cur->min_depth : std::min(cur->min_depth, dest->min_depth);
+        dest->max_depth = std::max(cur->max_depth, dest->max_depth);
+        dest->avg_depth += cur->avg_depth;
+        dest->avg_width += cur->avg_width;
+    }
+
+    if (used > 0) {
+        dest->avg_depth /= used;
+        dest->avg_width /= used;
+    }
+}
+
+static void add_to_stat(metric_t *dest, long long branching_factor)
+{ // one more level traversed: widen by the branching factor, deepen every depth figure by one
+    dest->avg_width += branching_factor;
+    dest->avg_depth += 1;
+    dest->max_depth += 1;
+    dest->min_depth += 1;
+}
+
+static bool traverse(nnode_t *node, uintptr_t traverse_mark_number)
+{ // true only on the first visit with this mark; the node is stamped with the mark either way
+    const bool first_visit = (node->traverse_visited != traverse_mark_number);
+    node->traverse_visited = traverse_mark_number;
+    return first_visit;
+}
+
+static bool traverse(nnet_t *net, uintptr_t traverse_mark_number)
+{ // true only on the first visit with this mark; the net is stamped with the mark either way
+    const bool first_visit = (net->traverse_visited != traverse_mark_number);
+    net->traverse_visited = traverse_mark_number;
+    return first_visit;
+}
+
+/* Count one more node of type op in the netlist's per-type table. */
+static void increment_type_count(operation_list op, netlist_t *netlist)
+{
+    auto &counter = netlist->num_of_type[op];
+    // a negative (unused) slot restarts from zero before being bumped
+    counter = (counter < 0) ? 1 : counter + 1;
+}
+static void count_node_type(operation_list op, nnode_t *node, netlist_t *netlist)
+{ // updates the netlist's per-type counters, node total and LUT estimate for one node, classified as op
+    switch (op) {
+    case LOGICAL_OR:
+    case LOGICAL_AND:
+    case LOGICAL_NOR:
+    case LOGICAL_NAND:
+    case LOGICAL_XOR:
+    case LOGICAL_XNOR:
+    case LOGICAL_NOT: {
+        increment_type_count(op, netlist); // counted under its own type ...
+        count_node_type(GENERIC, node, netlist); // ... and once more as generic logic
+        break;
+    }
+    case MUX_2: // fallthrough
+    case SMUX_2: {
+        increment_type_count(MUX_2, netlist); // both mux flavours are reported as MUX_2
+        count_node_type(GENERIC, node, netlist);
+        break;
+    }
+    case GENERIC:
+        /**
+         * generic nodes are packed into luts,
+         * so we only **roughly** estimate how many luts are needed;
+         * this allows us to give a predictive placement figure
+         */
+        if (physical_lut_size > 1 && node->num_input_pins > physical_lut_size) {
+            /* the estimate is based on the width of the node split to fit into the lut */
+            long long input_width = node->num_input_pins;
+            /* we have to account for the glue logic to join the result down to one pin */
+            while (input_width > 1) {
+                long long logic_element = input_width / physical_lut_size;
+                logic_element += ((input_width % physical_lut_size) != 0); // round up
+                input_width = logic_element; // the outputs of this level feed the next one
+                netlist->num_logic_element += logic_element;
+            }
+        } else {
+            netlist->num_logic_element += 1;
+        }
+        increment_type_count(op, netlist);
+        netlist->num_of_node += 1;
+        break;
+
+    case MINUS:
+        /* Minus nodes are built of Add nodes */
+        count_node_type(ADD, node, netlist);
+        break;
+
+    case PAD_NODE:
+    case GND_NODE:
+    case VCC_NODE:
+        /* These are irrelevant, so they are not counted at all */
+        break;
+
+    case INPUT_NODE:
+    case OUTPUT_NODE:
+        /* these are counted per type but are not added to the node total */
+        increment_type_count(op, netlist);
+        break;
+
+    case CLOCK_NODE:
+    case FF_NODE:
+    case MULTIPLY:
+    case ADD:
+    case MEMORY:
+    case HARD_IP:
+        /* these are counted under their own type and added to the node total */
+        increment_type_count(op, netlist);
+        netlist->num_of_node += 1;
+        break;
+
+    default:
+        /* everything else is generic */
+        count_node_type(GENERIC, node, netlist);
+        break;
+    }
+}
+
+static void count_node_type(nnode_t *node, netlist_t *netlist) { count_node_type(node->type, node, netlist); } // overload: classify by the node's own type
+
+/* Upward statistics of a net: aggregate over the nodes behind its driver pins; returns NULL for a NULL net. */
+static metric_t *get_upward_stat(nnet_t *net, netlist_t *netlist, uintptr_t traverse_mark_number)
+{
+    if (!net)
+        return NULL;
+    metric_t *destination = &(net->stat.upward);
+    if (traverse(net, traverse_mark_number)) { // an already visited net keeps its stored record
+        init(destination);
+        if (net->num_driver_pins) {
+            metric_t **parent_stat = (metric_t **)vtr::calloc(net->num_driver_pins, sizeof(metric_t *));
+            for (int i = 0; i < net->num_driver_pins; i++) {
+                if (net->driver_pins[i]) // fix: skip empty driver slots, as the sibling traversals do for their pins
+                    parent_stat[i] = get_upward_stat(net->driver_pins[i]->node, netlist, traverse_mark_number);
+            }
+            aggregate(destination, parent_stat, net->num_driver_pins); // NULL entries are ignored by aggregate()
+            vtr::free(parent_stat);
+        }
+    }
+    return destination;
+}
+
+/* Upward statistics of a node: aggregate of its input nets plus one level for the node itself. */
+static metric_t *get_upward_stat(nnode_t *node, netlist_t *netlist, uintptr_t traverse_mark_number)
+{
+    if (!node)
+        return NULL;
+
+    metric_t *result = &(node->stat.upward);
+    if (!traverse(node, traverse_mark_number))
+        return result; // already visited with this mark: the stored record is handed back untouched
+
+    count_node_type(node, netlist);
+    init(result);
+    if (node->num_input_pins) {
+        metric_t **per_input = (metric_t **)vtr::calloc(node->num_input_pins, sizeof(metric_t *));
+        for (long long pin = 0; pin < node->num_input_pins; pin++) {
+            if (!node->input_pins[pin])
+                continue; // empty slot: its entry is left as allocated and skipped by aggregate() when NULL
+            per_input[pin] = get_upward_stat(node->input_pins[pin]->net, netlist, traverse_mark_number);
+        }
+        aggregate(result, per_input, node->num_input_pins);
+        vtr::free(per_input);
+    }
+    add_to_stat(result, node->num_input_pins);
+    return result;
+}
+
+/* Downward statistics of a net: aggregate over the nodes behind its fanout pins. */
+static metric_t *get_downward_stat(nnet_t *net, netlist_t *netlist, uintptr_t traverse_mark_number)
+{
+    if (!net)
+        return NULL;
+    metric_t *result = &(net->stat.downward);
+    if (!traverse(net, traverse_mark_number))
+        return result; // already visited with this mark: the stored record is handed back untouched
+
+    init(result);
+    if (net->num_fanout_pins) {
+        metric_t **per_sink = (metric_t **)vtr::calloc(net->num_fanout_pins, sizeof(metric_t *));
+        for (long long sink = 0; sink < net->num_fanout_pins; sink++) {
+            if (!net->fanout_pins[sink])
+                continue; // empty fanout slot
+            per_sink[sink] = get_downward_stat(net->fanout_pins[sink]->node, netlist, traverse_mark_number);
+        }
+        aggregate(result, per_sink, net->num_fanout_pins);
+        vtr::free(per_sink);
+    }
+    return result;
+}
+
+/* Downward statistics of a node: aggregate of its output nets plus one level for the node itself. */
+static metric_t *get_downward_stat(nnode_t *node, netlist_t *netlist, uintptr_t traverse_mark_number)
+{
+    if (!node)
+        return NULL;
+
+    metric_t *result = &(node->stat.downward);
+    if (!traverse(node, traverse_mark_number))
+        return result; // already visited with this mark: the stored record is handed back untouched
+
+    count_node_type(node, netlist);
+    init(result);
+    if (node->num_output_pins) {
+        metric_t **per_output = (metric_t **)vtr::calloc(node->num_output_pins, sizeof(metric_t *));
+        for (long long pin = 0; pin < node->num_output_pins; pin++) {
+            if (!node->output_pins[pin])
+                continue; // empty output slot
+            per_output[pin] = get_downward_stat(node->output_pins[pin]->net, netlist, traverse_mark_number);
+        }
+        aggregate(result, per_output, node->num_output_pins);
+        vtr::free(per_output);
+    }
+    add_to_stat(result, node->num_output_pins);
+    return result;
+}
+
+/* Aggregate the upward statistics of node_count nodes into destination, which is left untouched for an empty list. */
+static metric_t *get_upward_stat(metric_t *destination, nnode_t **node_list, long long node_count, netlist_t *netlist, uintptr_t traverse_mark_number)
+{
+    if (!node_list || !node_count)
+        return destination;
+
+    metric_t **per_node = (metric_t **)vtr::calloc(node_count, sizeof(metric_t *));
+    for (long long n = 0; n < node_count; n++)
+        per_node[n] = get_upward_stat(node_list[n], netlist, traverse_mark_number);
+
+    aggregate(destination, per_node, node_count);
+    vtr::free(per_node);
+
+    return destination;
+}
+
+stat_t *get_stats(nnode_t *node, netlist_t *netlist, uintptr_t traverse_mark_number)
+{ // returns a vtr::malloc'ed copy of the node's downward and upward metric records
+    stat_t *stat = (stat_t *)vtr::malloc(sizeof(stat_t));
+    copy(&stat->downward, get_downward_stat(node, netlist, traverse_mark_number));
+    copy(&stat->upward, get_upward_stat(node, netlist, traverse_mark_number)); // NOTE(review): the downward pass just stamped node with this same mark, so this call returns the stored upward record without recomputing it -- confirm intended
+    return stat;
+}
+
+static const char _travelsal_id = 0;
+static const uintptr_t travelsal_id = (uintptr_t)&_travelsal_id;
+
+/*---------------------------------------------------------------------------------------------
+ * function: compute_statistics() resets the netlist counters, gathers the upward statistics
+ * starting from the top output nodes and, when display is set, logs the summary
+ *-------------------------------------------------------------------------------------------*/
+void compute_statistics(netlist_t *netlist, bool display)
+{
+    if (!netlist)
+        return;
+
+    // start from clean counters, then walk upward from every top-level output
+    init_stat(netlist);
+    get_upward_stat(&netlist->output_node_stat, netlist->top_output_nodes, netlist->num_top_output_nodes, netlist, travelsal_id + 1);
+
+    if (!display)
+        return;
+    log("\n\t==== Stats ====\n");
+    for (long long type = 0; type < operation_list_END; type++) {
+        if (netlist->num_of_type[type] <= UNUSED_NODE_TYPE)
+            continue; // this type was never counted
+        std::string label = std::string("Number of <") + operation_list_STR[type][ODIN_LONG_STRING] + "> node: ";
+        log("%-42s%lld\n", label.c_str(), netlist->num_of_type[type]);
+    }
+    log("%-42s%lld\n", "Total estimated number of lut: ", netlist->num_logic_element);
+    log("%-42s%lld\n", "Total number of node: ", netlist->num_of_node);
+    log("%-42s%0.0f\n", "Longest path: ", netlist->output_node_stat.max_depth);
+    log("%-42s%0.0f\n", "Average path: ", netlist->output_node_stat.avg_depth);
+    log("\n");
+}
\ No newline at end of file
diff --git a/parmys-plugin/netlist/netlist_statistic.h b/parmys-plugin/netlist/netlist_statistic.h
new file mode 100644
index 000000000..91e7a516d
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_statistic.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _NETLIST_STATISTIC_H_
+#define _NETLIST_STATISTIC_H_
+
+#include "netlist_utils.h"
+
+static const unsigned int traversal_id = 0;
+static const uintptr_t mult_optimization_traverse_value = (uintptr_t)&traversal_id;
+
+stat_t *get_stats(nnode_t *node, netlist_t *netlist, uintptr_t traverse_mark_number);
+
+void init_stat(netlist_t *netlist);
+void compute_statistics(netlist_t *netlist, bool display);
+
+/**
+ * @brief This function will calculate and assign weights related
+ * to mixing hard and soft logic implementation for certain kind
+ * of logic blocks
+ * @param node
+ * The node that needs its weight to be assigned
+ * @param netlist
+ * netlist, has to be passed to the counting functions
+ */
+void mixing_optimization_stats(nnode_t *node, netlist_t *netlist);
+
+#endif // _NETLIST_STATISTIC_H_
diff --git a/parmys-plugin/netlist/netlist_utils.cc b/parmys-plugin/netlist/netlist_utils.cc
new file mode 100644
index 000000000..9c3fb060b
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_utils.cc
@@ -0,0 +1,1472 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_globals.h"
+#include "odin_types.h"
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "netlist_utils.h"
+#include "node_utils.h"
+#include "odin_util.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+/*---------------------------------------------------------------------------------------------
+ * (function: allocate_nnode)
+ *  Allocates a node with my_malloc_struct, stores loc in it and puts the fields
+ *  listed below into their "empty" state: NO_OP type, no name, no pins, no port
+ *  sizes, all levels at -1 and a fresh attribute record from init_attribute().
+ *  Returns the new node.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *allocate_nnode(loc_t loc)
+{
+    nnode_t *node = (nnode_t *)my_malloc_struct(sizeof(nnode_t));
+
+    /* identity */
+    node->loc = loc;
+    node->name = NULL;
+    node->type = NO_OP;
+    node->bit_width = 0;
+    node->related_ast_node = NULL;
+
+    /* attached data */
+    node->node_data = NULL;
+    node->unique_node_data_id = -1;
+    node->attributes = init_attribute();
+    node->initial_value = init_value_e::undefined;
+
+    /* pins: none connected yet */
+    node->num_input_pins = 0;
+    node->input_pins = NULL;
+    node->num_output_pins = 0;
+    node->output_pins = NULL;
+
+    /* port widths: none recorded yet */
+    node->num_input_port_sizes = 0;
+    node->input_port_sizes = NULL;
+    node->num_output_port_sizes = 0;
+    node->output_port_sizes = NULL;
+
+    /* traversal and levelization state */
+    node->traverse_visited = -1;
+    node->forward_level = -1;
+    node->backward_level = -1;
+    node->sequential_level = -1;
+    node->sequential_terminator = false;
+
+    return node;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: free_nnode)
+ *  Releases a node with its pins, pin arrays, port sizes, attributes and name; NULL is accepted.
+ *  NOTE(review): node_data is not released here, and pins are freed with vtr::free directly,
+ *  so a pin's `mapping` string (which free_npin releases) is not freed -- confirm intended.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *free_nnode(nnode_t *to_free)
+{
+    if (!to_free)
+        return (nnode_t *)vtr::free(to_free);
+
+    /* input pins: name first, then the pin, finally the array */
+    for (int i = 0; i < to_free->num_input_pins; i++) {
+        npin_t *pin = to_free->input_pins[i];
+        if (pin && pin->name) {
+            vtr::free(pin->name);
+            pin->name = NULL;
+        }
+        to_free->input_pins[i] = (npin_t *)vtr::free(pin);
+    }
+    to_free->input_pins = (npin_t **)vtr::free(to_free->input_pins);
+
+    /* output pins: same procedure */
+    for (int i = 0; i < to_free->num_output_pins; i++) {
+        npin_t *pin = to_free->output_pins[i];
+        if (pin && pin->name) {
+            vtr::free(pin->name);
+            pin->name = NULL;
+        }
+        to_free->output_pins[i] = (npin_t *)vtr::free(pin);
+    }
+    to_free->output_pins = (npin_t **)vtr::free(to_free->output_pins);
+
+    vtr::free(to_free->input_port_sizes);
+    vtr::free(to_free->output_port_sizes);
+    free_attribute(to_free->attributes);
+
+    if (to_free->name) {
+        vtr::free(to_free->name);
+        to_free->name = NULL;
+    }
+    return (nnode_t *)vtr::free(to_free);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: allocate_more_input_pins)
+ *  Grows the node's input pin array by width empty (NULL) slots.
+ *-----------------------------------------------------------------------*/
+void allocate_more_input_pins(nnode_t *node, int width)
+{
+    if (width < 1) {
+        error_message(NETLIST, node->loc, "tried adding input pins for width %d <= 0 %s\n", width, node->name);
+        return;
+    }
+
+    const auto first_new = node->num_input_pins;
+    node->num_input_pins += width;
+    node->input_pins = (npin_t **)vtr::realloc(node->input_pins, sizeof(npin_t *) * node->num_input_pins);
+    for (auto slot = first_new; slot < node->num_input_pins; slot++)
+        node->input_pins[slot] = NULL;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: allocate_more_output_pins)
+ *  Grows the node's output pin array by width empty (NULL) slots.
+ *-----------------------------------------------------------------------*/
+void allocate_more_output_pins(nnode_t *node, int width)
+{
+    if (width < 1) {
+        error_message(NETLIST, node->loc, "tried adding output pins for width %d <= 0 %s\n", width, node->name);
+        return;
+    }
+
+    const auto first_new = node->num_output_pins;
+    node->num_output_pins += width;
+    node->output_pins = (npin_t **)vtr::realloc(node->output_pins, sizeof(npin_t *) * node->num_output_pins);
+    for (auto slot = first_new; slot < node->num_output_pins; slot++)
+        node->output_pins[slot] = NULL;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_output_port_information) appends port_width to the node's output port size list
+ *-------------------------------------------------------------------------------------------*/
+void add_output_port_information(nnode_t *node, int port_width)
+{
+    const auto slot = node->num_output_port_sizes++;
+    node->output_port_sizes = (int *)vtr::realloc(node->output_port_sizes, sizeof(int) * (slot + 1));
+    node->output_port_sizes[slot] = port_width;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_input_port_information) appends port_width to the node's input port size list
+ *-------------------------------------------------------------------------------------------*/
+void add_input_port_information(nnode_t *node, int port_width)
+{
+    const auto slot = node->num_input_port_sizes++;
+    node->input_port_sizes = (int *)vtr::realloc(node->input_port_sizes, sizeof(int) * (slot + 1));
+    node->input_port_sizes[slot] = port_width;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: allocate_npin)
+ *  Allocates a pin with my_malloc_struct that is attached to nothing: no net, no node, no name
+ *-------------------------------------------------------------------------------------------*/
+npin_t *allocate_npin()
+{
+    npin_t *pin = (npin_t *)my_malloc_struct(sizeof(npin_t));
+
+    /* no owner yet: neither a net nor a node */
+    pin->net = NULL;
+    pin->pin_net_idx = -1;
+    pin->node = NULL;
+    pin->pin_node_idx = -1;
+
+    /* descriptive fields */
+    pin->type = NO_ID;
+    pin->name = NULL;
+    pin->mapping = NULL;
+
+    /* counters and flags */
+    pin->coverage = 0;
+    pin->is_default = false;
+    pin->is_implied = false;
+
+    return pin;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: free_npin)
+ *  Releases a pin together with its name and mapping strings; NULL is accepted.
+ *  The net and node the pin points at are left untouched.
+ *-------------------------------------------------------------------------------------------*/
+npin_t *free_npin(npin_t *to_free)
+{
+    if (to_free != NULL) {
+        /* the strings go first, then the pin itself below */
+        if (to_free->mapping)
+            vtr::free(to_free->mapping);
+        if (to_free->name)
+            vtr::free(to_free->name);
+
+        to_free->mapping = NULL;
+        to_free->name = NULL;
+    }
+
+    return (npin_t *)vtr::free(to_free);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: copy_npin)
+ *  Duplicates a pin's name, mapping, type, is_default and sensitivity.
+ *  Only meant to be called by copy_input_npin & copy_output_npin.
+ *-----------------------------------------------------------------------*/
+static npin_t *copy_npin(npin_t *copy_pin)
+{
+    npin_t *duplicate = allocate_npin(); /* name and mapping start out NULL */
+    if (copy_pin->name)
+        duplicate->name = vtr::strdup(copy_pin->name);
+    if (copy_pin->mapping)
+        duplicate->mapping = vtr::strdup(copy_pin->mapping);
+    duplicate->type = copy_pin->type;
+    duplicate->is_default = copy_pin->is_default;
+    duplicate->sensitivity = copy_pin->sensitivity;
+    return duplicate;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: copy_output_npin)
+ *  Duplicates an output pin; the duplicate points at the same net but is
+ *  not added to that net's driver list here
+ *-----------------------------------------------------------------------*/
+npin_t *copy_output_npin(npin_t *copy_pin)
+{
+    npin_t *duplicate = copy_npin(copy_pin);
+    oassert(copy_pin->type == OUTPUT);
+    duplicate->net = copy_pin->net;
+    return duplicate;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: copy_input_npin)
+ *  Duplicates an input pin; when the original is attached to a net the
+ *  duplicate is registered as one more fanout of that net
+ *-----------------------------------------------------------------------*/
+npin_t *copy_input_npin(npin_t *copy_pin)
+{
+    npin_t *duplicate = copy_npin(copy_pin);
+    oassert(copy_pin->type == INPUT);
+    if (copy_pin->net)
+        add_fanout_pin_to_net(copy_pin->net, duplicate);
+
+    return duplicate;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: allocate_nnet)
+ *  Allocates a net with my_malloc_struct that has no name, no driver pins and no fanout pins
+ *-------------------------------------------------------------------------------------------*/
+nnet_t *allocate_nnet()
+{
+    nnet_t *net = (nnet_t *)my_malloc_struct(sizeof(nnet_t));
+
+    net->num_driver_pins = 0;
+    net->driver_pins = NULL;
+    net->num_fanout_pins = 0;
+    net->fanout_pins = NULL;
+
+    net->name = NULL;
+    net->combined = false;
+    net->net_data = NULL;
+    net->unique_net_data_id = -1;
+    return net;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: free_nnet)
+ *  Releases a net, its name and its two pin arrays; the pins themselves are not freed
+ *-------------------------------------------------------------------------------------------*/
+nnet_t *free_nnet(nnet_t *to_free)
+{
+    if (!to_free)
+        return (nnet_t *)vtr::free(to_free);
+
+    if (to_free->name)
+        vtr::free(to_free->name);
+    if (to_free->num_driver_pins)
+        vtr::free(to_free->driver_pins);
+    to_free->fanout_pins = (npin_t **)vtr::free(to_free->fanout_pins);
+
+    /* now free the net */
+    return (nnet_t *)vtr::free(to_free);
+}
+
+/*---------------------------------------------------------------------------
+ * (function: move_output_pin)
+ *  Moves the output pin stored in slot old_idx to slot new_idx of the same node.
+ *  Both slots must already be allocated and old_idx must hold a pin.
+ *-------------------------------------------------------------------------*/
+void move_output_pin(nnode_t *node, int old_idx, int new_idx)
+{
+    oassert(node != NULL);
+    oassert(((old_idx >= 0) && (old_idx < node->num_output_pins)));
+    oassert(((new_idx >= 0) && (new_idx < node->num_output_pins)));
+    npin_t *pin = node->output_pins[old_idx];
+    /* fix: clear first, store second, so that old_idx == new_idx keeps the pin instead of dropping it */
+    node->output_pins[old_idx] = NULL;
+    node->output_pins[new_idx] = pin;
+    /* record the node and pin spot in the pin */
+    pin->type = OUTPUT;
+    pin->node = node;
+    pin->pin_node_idx = new_idx;
+}
+
+/*---------------------------------------------------------------------------
+ * (function: move_input_pin)
+ *  Moves the input pin stored in slot old_idx to slot new_idx of the same node.
+ *  Both slots must already be allocated and old_idx must hold a pin.
+ *-------------------------------------------------------------------------*/
+void move_input_pin(nnode_t *node, int old_idx, int new_idx)
+{
+    oassert(node != NULL);
+    oassert(((old_idx >= 0) && (old_idx < node->num_input_pins)));
+    oassert(((new_idx >= 0) && (new_idx < node->num_input_pins)));
+    npin_t *pin = node->input_pins[old_idx];
+    /* fix: clear first, store second, so that old_idx == new_idx keeps the pin instead of dropping it */
+    node->input_pins[old_idx] = NULL;
+    node->input_pins[new_idx] = pin;
+    /* record the node and pin spot in the pin */
+    pin->type = INPUT;
+    pin->node = node;
+    pin->pin_node_idx = new_idx;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_input_pin_to_node)
+ *  Stores pin in the already allocated input slot pin_idx of node and marks the pin as INPUT
+ *-------------------------------------------------------------------------------------------*/
+void add_input_pin_to_node(nnode_t *node, npin_t *pin, int pin_idx)
+{
+    oassert(node != NULL);
+    oassert(pin != NULL);
+    oassert(((pin_idx >= 0) && (pin_idx < node->num_input_pins))); /* fix: a negative index is rejected too */
+    node->input_pins[pin_idx] = pin;
+    /* record the node and pin spot in the pin */
+    pin->type = INPUT;
+    pin->node = node;
+    pin->pin_node_idx = pin_idx;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_fanout_pin_to_net)
+ *  Appends pin to the net's fanout list (growing it by one) and marks the pin as INPUT
+ *-------------------------------------------------------------------------------------------*/
+void add_fanout_pin_to_net(nnet_t *net, npin_t *pin)
+{
+    oassert(net != NULL);
+    oassert(pin != NULL);
+    oassert(pin->type != OUTPUT);
+    const auto slot = net->num_fanout_pins;
+    net->fanout_pins = (npin_t **)vtr::realloc(net->fanout_pins, sizeof(npin_t *) * (slot + 1));
+    net->fanout_pins[slot] = pin;
+    net->num_fanout_pins++;
+    pin->pin_net_idx = slot;
+    pin->type = INPUT;
+    pin->net = net;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_output_pin_to_node)
+ *  Stores pin in the already allocated output slot pin_idx of node and marks the pin as OUTPUT
+ *-------------------------------------------------------------------------------------------*/
+void add_output_pin_to_node(nnode_t *node, npin_t *pin, int pin_idx)
+{
+    oassert(node != NULL);
+    oassert(pin != NULL);
+    oassert(((pin_idx >= 0) && (pin_idx < node->num_output_pins))); /* fix: a negative index is rejected too */
+    node->output_pins[pin_idx] = pin;
+    /* record the node and pin spot in the pin */
+    pin->type = OUTPUT;
+    pin->node = node;
+    pin->pin_node_idx = pin_idx;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_driver_pin_to_net)
+ *  Appends pin to the net's driver list (growing it by one) and marks the pin as OUTPUT
+ *-------------------------------------------------------------------------------------------*/
+void add_driver_pin_to_net(nnet_t *net, npin_t *pin)
+{
+    oassert(net != NULL);
+    oassert(pin != NULL);
+    oassert(pin->type != INPUT);
+    /* the new driver takes the slot right after the current last one */
+    const auto slot = net->num_driver_pins++;
+    net->driver_pins = (npin_t **)vtr::realloc(net->driver_pins, net->num_driver_pins * sizeof(npin_t *));
+    net->driver_pins[slot] = pin;
+    pin->pin_net_idx = slot;
+    pin->type = OUTPUT;
+    pin->net = net;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: join_nets)
+ *  Moves every fanout pin of other_net onto join_to_net, then frees other_net. Joining a net with
+ *  itself, or one with several drivers, is reported via error_message. TODO: improve error message
+ *-------------------------------------------------------------------------------------------*/
+void join_nets(nnet_t *join_to_net, nnet_t *other_net)
+{
+    if (join_to_net == other_net) {
+        for (int i = 0; i < join_to_net->num_driver_pins; i++) {
+            const char *pin_name = join_to_net->driver_pins[i]->name ? join_to_net->driver_pins[i]->name : "unknown";
+            if ((join_to_net->driver_pins[i]->node != NULL))
+                warning_message(NETLIST, join_to_net->driver_pins[i]->node->loc, "%s %s\n", "Combinational loop with driver pin", pin_name);
+            else
+                warning_message(NETLIST, unknown_location, "%s %s\n", "Combinational loop with driver pin", pin_name);
+        }
+        for (int i = 0; i < join_to_net->num_fanout_pins; i++) {
+            npin_t *fanout = join_to_net->fanout_pins[i]; /* fix: may be NULL (remapping clears slots), so test before reading the name */
+            const char *pin_name = (fanout && fanout->name) ? fanout->name : "unknown";
+            if ((fanout != NULL) && (fanout->node != NULL))
+                warning_message(NETLIST, fanout->node->loc, "%s %s\n", "Combinational loop with fanout pin", pin_name);
+            else
+                warning_message(NETLIST, unknown_location, "%s %s\n", "Combinational loop with fanout pin", pin_name);
+        }
+        error_message(NETLIST, unknown_location, "%s", "Found a combinational loop");
+    } else if (other_net->num_driver_pins > 1) {
+        if (other_net->name && join_to_net->name)
+            error_message(NETLIST, unknown_location, "Tried to join net %s to %s but this would lose %d drivers for net %s", other_net->name,
+                          join_to_net->name, other_net->num_driver_pins - 1, other_net->name);
+        else
+            error_message(NETLIST, unknown_location, "Tried to join nets but this would lose %d drivers", other_net->num_driver_pins - 1);
+    }
+
+    /* hand the fanout pins of other_net over to join_to_net (empty slots are skipped) */
+    for (int i = 0; i < other_net->num_fanout_pins; i++) {
+        if (other_net->fanout_pins[i]) {
+            add_fanout_pin_to_net(join_to_net, other_net->fanout_pins[i]);
+        }
+    }
+
+    // CLEAN UP
+    free_nnet(other_net);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: remap_pin_to_new_net)
+ * 	Detaches pin from the net it currently belongs to and attaches it to new_net.
+ * 	INPUT pin : its fanout slot in the old net is set to NULL, then the pin is added
+ * 	            as a fanout of new_net.
+ * 	OUTPUT pin: the old net's whole driver array is released and its driver count is
+ * 	            reset to zero, then the pin is added as a driver of new_net.
+ * 	Pins of any other type are left untouched.
+ *-------------------------------------------------------------------------------------------*/
+void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net)
+{
+    if (pin->type == INPUT) {
+        /* leave a hole where the pin used to sit in the old net */
+        nnet_t *old_net = pin->net;
+        old_net->fanout_pins[pin->pin_net_idx] = NULL;
+
+        add_fanout_pin_to_net(new_net, pin);
+        return;
+    }
+
+    if (pin->type != OUTPUT)
+        return;
+
+    /* the old net loses its complete driver list */
+    nnet_t *old_net = pin->net;
+    if (old_net->num_driver_pins)
+        vtr::free(old_net->driver_pins);
+    old_net->driver_pins = NULL;
+    old_net->num_driver_pins = 0;
+
+    add_driver_pin_to_net(new_net, pin);
+}
+
+/*-------------------------------------------------------------------------
+ * (function: remap_pin_to_new_node)
+ * 	Moves pin from the node it is attached to onto slot pin_idx of new_node.
+ * 	The slot the pin occupied on the old node is set to NULL. INPUT pins are
+ * 	re-attached as inputs, OUTPUT pins as outputs; pins of any other type are
+ * 	left untouched.
+ *-----------------------------------------------------------------------*/
+void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx)
+{
+    if (pin->type == INPUT) {
+        /* empty the slot on the old node, then hook into the new one */
+        nnode_t *old_node = pin->node;
+        old_node->input_pins[pin->pin_node_idx] = NULL;
+        add_input_pin_to_node(new_node, pin, pin_idx);
+        return;
+    }
+
+    if (pin->type != OUTPUT)
+        return;
+
+    /* empty the slot on the old node, then hook into the new one */
+    nnode_t *old_node = pin->node;
+    old_node->output_pins[pin->pin_node_idx] = NULL;
+    add_output_pin_to_node(new_node, pin, pin_idx);
+}
+
+/*------------------------------------------------------------------------
+ * (function: connect_nodes)
+ * 	Connects output slot out_idx of out_node to input slot in_idx of in_node.
+ * 	A new input pin is always created on in_node. If the output slot already
+ * 	holds a pin, the new input pin becomes one more fanout of that pin's net;
+ * 	otherwise a new net (named after out_node) and a new output pin are
+ * 	created first.
+ *----------------------------------------------------------------------*/
+void connect_nodes(nnode_t *out_node, int out_idx, nnode_t *in_node, int in_idx)
+{
+    oassert(out_node->num_output_pins > out_idx);
+    oassert(in_node->num_input_pins > in_idx);
+
+    /* the consuming side always gets a fresh pin */
+    npin_t *sink_pin = allocate_npin();
+    add_input_pin_to_node(in_node, sink_pin, in_idx);
+
+    npin_t *existing_out_pin = out_node->output_pins[out_idx];
+    if (existing_out_pin != NULL) {
+        /* the output is already driving a net: just add one more fanout */
+        add_fanout_pin_to_net(existing_out_pin->net, sink_pin);
+        return;
+    }
+
+    /* the output slot is empty: build the net and its driver pin from scratch */
+    nnet_t *fresh_net = allocate_nnet();
+    npin_t *source_pin = allocate_npin();
+    fresh_net->name = vtr::strdup(out_node->name);
+
+    add_output_pin_to_node(out_node, source_pin, out_idx);
+    add_fanout_pin_to_net(fresh_net, sink_pin);
+    add_driver_pin_to_net(fresh_net, source_pin);
+}
+
+/**
+ * -------------------------------------------------------------------------------------------
+ * (function: init_attribute)
+ *
+ * @brief Allocates a node attribute structure and puts every field into its
+ * default state: sensitivities undefined, reset values zero, both ports
+ * unsigned, memory fields zero / NULL.
+ *
+ * @return the newly allocated attribute
+ *-------------------------------------------------------------------------------------------*/
+attr_t *init_attribute()
+{
+    attr_t *attr = (attr_t *)vtr::malloc(sizeof(attr_t));
+
+    /* control signal sensitivities */
+    attr->clk_edge_type = UNDEFINED_SENSITIVITY;
+    attr->clr_polarity = UNDEFINED_SENSITIVITY;
+    attr->set_polarity = UNDEFINED_SENSITIVITY;
+    attr->enable_polarity = UNDEFINED_SENSITIVITY;
+    attr->areset_polarity = UNDEFINED_SENSITIVITY;
+    attr->sreset_polarity = UNDEFINED_SENSITIVITY;
+
+    /* reset values */
+    attr->areset_value = 0;
+    attr->sreset_value = 0;
+
+    /* port signedness */
+    attr->port_a_signed = UNSIGNED;
+    attr->port_b_signed = UNSIGNED;
+
+    /* memory node: identity and geometry */
+    attr->memory_id = NULL;
+    attr->size = 0;
+    attr->offset = 0;
+    attr->DBITS = 0;
+    attr->ABITS = 0;
+    attr->RD_PORTS = 0;
+    attr->WR_PORTS = 0;
+
+    /* memory node: clocking */
+    attr->RD_CLK_ENABLE = UNDEFINED_SENSITIVITY;
+    attr->WR_CLK_ENABLE = UNDEFINED_SENSITIVITY;
+    attr->RD_CLK_POLARITY = UNDEFINED_SENSITIVITY;
+    attr->WR_CLK_POLARITY = UNDEFINED_SENSITIVITY;
+
+    return (attr);
+}
+
+/**
+ * -------------------------------------------------------------------------------------------
+ * (function: copy_attribute)
+ *
+ * @brief copy every field of one attribute into another. The memory_id
+ * string is duplicated, so both attributes own their own copy.
+ *
+ * NOTE: `to` is received by value, so this function cannot hand a newly
+ * allocated attribute back to the caller. A NULL destination is therefore
+ * a no-op (allocating one here would only leak it).
+ *
+ * @param to will copy to this attr (must already be allocated)
+ * @param copy the attr that will be copied (must not be NULL)
+ *-------------------------------------------------------------------------------------------*/
+void copy_attribute(attr_t *to, attr_t *copy)
+{
+    oassert(copy != NULL);
+
+    /* nothing can be returned through a by-value pointer */
+    if (to == NULL)
+        return;
+
+    to->clk_edge_type = copy->clk_edge_type;
+    to->clr_polarity = copy->clr_polarity;
+    to->set_polarity = copy->set_polarity;
+    to->areset_polarity = copy->areset_polarity;
+    to->sreset_polarity = copy->sreset_polarity;
+    to->enable_polarity = copy->enable_polarity;
+
+    to->areset_value = copy->areset_value;
+    to->sreset_value = copy->sreset_value;
+
+    to->port_a_signed = copy->port_a_signed;
+    to->port_b_signed = copy->port_b_signed;
+
+    /* memory node attributes */
+    to->size = copy->size;
+    to->offset = copy->offset;
+
+    /* Duplicate first so copying an attribute onto itself stays valid, then
+     * release the string the destination held so far.
+     * NOTE(review): assumes `to` was set up by init_attribute(), i.e. its
+     * memory_id is either NULL or a heap string owned by `to` - confirm */
+    char *memory_id_copy = vtr::strdup(copy->memory_id);
+    vtr::free(to->memory_id);
+    to->memory_id = memory_id_copy;
+
+    to->RD_CLK_ENABLE = copy->RD_CLK_ENABLE;
+    to->WR_CLK_ENABLE = copy->WR_CLK_ENABLE;
+    to->RD_CLK_POLARITY = copy->RD_CLK_POLARITY;
+    to->WR_CLK_POLARITY = copy->WR_CLK_POLARITY;
+
+    to->RD_PORTS = copy->RD_PORTS;
+    to->WR_PORTS = copy->WR_PORTS;
+    to->DBITS = copy->DBITS;
+    to->ABITS = copy->ABITS;
+}
+
+/**
+ * -------------------------------------------------------------------------------------------
+ * (function: copy_signedness)
+ *
+ * @brief copy the signedness variables (port_a_signed, port_b_signed) of an
+ * attribute to another one.
+ *
+ * NOTE: `to` is received by value, so an attribute allocated here could never
+ * reach the caller and would only leak. A NULL destination is a no-op.
+ *
+ * @param to will copy to this attr (must already be allocated)
+ * @param copy the attr that will be copied
+ *-------------------------------------------------------------------------------------------*/
+void copy_signedness(attr_t *to, attr_t *copy)
+{
+    /* nothing can be returned through a by-value pointer */
+    if (to == NULL)
+        return;
+
+    to->port_a_signed = copy->port_a_signed;
+    to->port_b_signed = copy->port_b_signed;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: init_signal_list)
+ * 	Allocates an empty signal list: no pins, count zero, and the is_memory /
+ * 	is_adder flags cleared. Signal lists describe the inputs and outputs of
+ * 	elements as they connect to other elements in the graph.
+ *-------------------------------------------------------------------------------------------*/
+signal_list_t *init_signal_list()
+{
+    signal_list_t *fresh = (signal_list_t *)vtr::malloc(sizeof(signal_list_t));
+
+    /* no pins stored yet */
+    fresh->pins = NULL;
+    fresh->count = 0;
+
+    /* flags start cleared */
+    fresh->is_adder = false;
+    fresh->is_memory = false;
+
+    return fresh;
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: is_constant_signal)
+ *
+ * @brief tells whether every pin of a signal list sits on a constant net.
+ * A pin counts as constant when the name of its net equals the name of the
+ * netlist's zero net or of its one net. An empty list is reported constant.
+ *
+ * @param signal list of pins
+ * @param netlist pointer to the current netlist file
+ *
+ * @return true if all pins are constant, false as soon as one is not
+ *-------------------------------------------------------------------------------------------*/
+bool is_constant_signal(signal_list_t *signal, netlist_t *netlist)
+{
+    for (int i = 0; i < signal->count; i++) {
+        const char *net_name = signal->pins[i]->net->name;
+
+        /* connected to GND */
+        if (strcmp(net_name, netlist->zero_net->name) == 0)
+            continue;
+        /* connected to VCC */
+        if (strcmp(net_name, netlist->one_net->name) == 0)
+            continue;
+
+        /* neither GND nor VCC: the signal is not constant */
+        return (false);
+    }
+
+    return (true);
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: constant_signal_value)
+ *
+ * @brief computes the integer value of a constant signal list. Pin i
+ * contributes bit i: every pin whose net carries the name of the netlist's
+ * one net adds (1 << i), computed with an overflow check.
+ *
+ * @param signal list of pins (asserted to be constant)
+ * @param netlist pointer to the current netlist file
+ *
+ * @return the integer value of the constant signal
+ *-------------------------------------------------------------------------------------------*/
+long constant_signal_value(signal_list_t *signal, netlist_t *netlist)
+{
+    oassert(is_constant_signal(signal, netlist));
+
+    long value = 0;
+
+    for (int bit = 0; bit < signal->count; bit++) {
+        nnet_t *net = signal->pins[bit]->net;
+
+        /* only pins tied to VCC contribute */
+        if (strcmp(net->name, netlist->one_net->name) != 0)
+            continue;
+
+        value += shift_left_value_with_overflow_check(0X1, bit, unknown_location);
+    }
+
+    return (value);
+}
+
+/**
+ *---------------------------------------------------------------------------------------------
+ * (function: create_constant_signal)
+ *
+ * @brief create the signal_list of the given constant value. The value is
+ * converted to a bit string, left-padded to desired_width ('1' for negative
+ * values, '0' otherwise), and the lowest desired_width bits are emitted
+ * starting with the rightmost character of the string: a '1' becomes a one
+ * pin, anything else a zero pin.
+ *
+ * @param value a long long value
+ * @param desired_width the size of return signal list
+ * @param netlist pointer to the netlist
+ *
+ * @return a new signal list holding desired_width constant pins
+ *-------------------------------------------------------------------------------------------*/
+signal_list_t *create_constant_signal(const long long value, const int desired_width, netlist_t *netlist)
+{
+    signal_list_t *list = init_signal_list();
+
+    std::string binary_value_str = string_of_radix_to_bitstring(std::to_string(value), 10);
+
+    /* widen to desired_width in a single step instead of one character at a time */
+    const long converted_width = binary_value_str.length();
+    if (desired_width > converted_width) {
+        const char fill = (value < 0) ? '1' : '0';
+        binary_value_str = std::string(static_cast<std::string::size_type>(desired_width - converted_width), fill) + binary_value_str;
+    }
+
+    const long width = binary_value_str.length();
+    const long last = width - desired_width;
+
+    /* create vcc/gnd signal pins, walking from the rightmost character leftwards */
+    for (long i = width; i > last; i--) {
+        const bool bit_is_one = (binary_value_str[i - 1] == '1');
+        add_pin_to_signal_list(list, bit_is_one ? get_one_pin(netlist) : get_zero_pin(netlist));
+    }
+
+    return (list);
+}
+
+/**
+ * (function: prune_signal)
+ *
+ * @brief to prune extra pins. usually this happens when a memory
+ * address (less than 32 bits) comes from an arithmetic operation
+ * that makes it 32 bits (a lot of useless pins).
+ *
+ * The list is split into chunks of signal_width pins. In each of the first
+ * num_of_signals chunks the first prune_size pins are kept; every remaining
+ * pin is detached from its node (the node's input slot is set to NULL and
+ * the pin's node pointer is cleared) and a warning is issued. The kept pins
+ * of all chunks are merged with combine_lists().
+ *
+ * @param signalsvar signal list of pins (may include one signal or two signals)
+ * @param signal_width the width of each signal in signalsvar
+ * @param prune_size the desired size of signal list
+ * @param num_of_signals showing the number of signals the signalsvar containing
+ *
+ * @return pruned signal list, or signalsvar itself when prune_size >= signal_width
+ */
+signal_list_t *prune_signal(signal_list_t *signalsvar, long signal_width, long prune_size, int num_of_signals)
+{
+    /* validation */
+    oassert(prune_size);
+    oassert(num_of_signals);
+    oassert(signalsvar->count % signal_width == 0);
+
+    /* no need to prune */
+    if (prune_size >= signal_width)
+        return (signalsvar);
+
+    signal_list_t **chunks = split_signal_list(signalsvar, signal_width);
+
+    for (int sig = 0; sig < num_of_signals; sig++) {
+        signal_list_t *kept = init_signal_list();
+        signal_list_t *chunk = chunks[sig];
+
+        /* the leading prune_size pins survive */
+        long j = 0;
+        for (; j < prune_size; j++)
+            add_pin_to_signal_list(kept, chunk->pins[j]);
+
+        /* everything after that is cut loose from its node */
+        for (; j < signal_width; j++) {
+            npin_t *pin = chunk->pins[j];
+            pin->node->input_pins[pin->pin_node_idx] = NULL;
+            pin->node = NULL;
+            warning_message(NETLIST, unknown_location, "Input pin (%s) exceeds the size of its connected port, will be left unconnected",
+                            pin->net->name);
+        }
+
+        /* the pruned list takes the place of the chunk */
+        free_signal_list(chunk);
+        chunks[sig] = kept;
+    }
+
+    /* combining pruned signals */
+    signal_list_t *pruned = combine_lists(chunks, num_of_signals);
+
+    // CLEAN UP
+    vtr::free(chunks);
+
+    return (pruned);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: add_pin_to_signal_list)
+ * 	Appends pin to the end of list, growing the pin array by one slot
+ *-------------------------------------------------------------------------------------------*/
+void add_pin_to_signal_list(signal_list_t *list, npin_t *pin)
+{
+    /* index of the slot about to be created */
+    const auto slot = list->count;
+
+    list->pins = (npin_t **)vtr::realloc(list->pins, sizeof(npin_t *) * (slot + 1));
+    list->pins[slot] = pin;
+    list->count = slot + 1;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: combine_lists)
+ * 	Merges signal_lists[1..num_signal_lists-1] into signal_lists[0] and returns
+ * 	signal_lists[0]. A pin is appended only if signal_lists[0] does not already
+ * 	hold it: either the very same pin, or a pin with an equal name. Pins without
+ * 	a name only match themselves. Every merged list (index >= 1) is freed with
+ * 	free_signal_list(); NULL entries in signal_lists are skipped.
+ *-------------------------------------------------------------------------------------------*/
+signal_list_t *combine_lists(signal_list_t **signal_lists, int num_signal_lists)
+{
+    signal_list_t *merged = signal_lists[0];
+
+    for (int i = 1; i < num_signal_lists; i++) {
+        signal_list_t *current = signal_lists[i];
+        if (current == NULL)
+            continue;
+
+        for (int j = 0; j < current->count; j++) {
+            npin_t *candidate = current->pins[j];
+            bool pin_already_added = false;
+
+            /* stop scanning as soon as a match is found */
+            for (int k = 0; k < merged->count && !pin_already_added; k++) {
+                npin_t *existing = merged->pins[k];
+                if (existing == candidate) {
+                    pin_already_added = true;
+                } else if (existing && candidate && existing->name && candidate->name) {
+                    /* strcmp must never see a NULL name */
+                    pin_already_added = (strcmp(existing->name, candidate->name) == 0);
+                }
+            }
+
+            if (!pin_already_added)
+                add_pin_to_signal_list(merged, candidate);
+        }
+
+        free_signal_list(current);
+    }
+
+    return merged;
+}
+
+/**
+ * (function: split_signal_list)
+ *
+ * @brief split signals list to a list of signal list with requested width
+ *
+ * When the list is exactly `width` pins long, no copy is made: the returned
+ * array has a single entry that is signalsvar itself. Otherwise the list is
+ * cut into count / width new signal lists that reference the same pins.
+ *
+ * @param signalsvar signal list of pins (may include one signal or two signals)
+ * @param width the width of each signal in signalsvar
+ *
+ * @return newly allocated array of signal lists
+ */
+signal_list_t **split_signal_list(signal_list_t *signalsvar, const int width)
+{
+    /* check if split is needed */
+    if (signalsvar->count == width) {
+        signal_list_t **single = (signal_list_t **)vtr::calloc(1, sizeof(signal_list_t *));
+        single[0] = signalsvar;
+        return (single);
+    }
+
+    /* validate signals list size */
+    oassert(width != 0);
+    oassert(signalsvar->count % width == 0);
+
+    const int num_chunk = signalsvar->count / width;
+    signal_list_t **chunks = (signal_list_t **)vtr::calloc(num_chunk, sizeof(signal_list_t *));
+
+    for (int chunk = 0; chunk < num_chunk; chunk++) {
+        /* position of this chunk's first pin in the source list */
+        const int first_pin = chunk * width;
+
+        chunks[chunk] = init_signal_list();
+        for (int j = 0; j < width; j++)
+            add_pin_to_signal_list(chunks[chunk], signalsvar->pins[first_pin + j]);
+    }
+
+    return (chunks);
+}
+
+/**
+ * (function: sigcmp)
+ *
+ * @brief to check if sig is the same as be_checked: two signal lists are
+ * the same when the pins at every position are attached to the same net
+ *
+ * @param sig first signals
+ * @param be_checked second signals (asserted to have the same size as sig)
+ *
+ * @return true if all pin pairs share their net, false otherwise
+ */
+bool sigcmp(signal_list_t *sig, signal_list_t *be_checked)
+{
+    /* validate signal sizes */
+    oassert(sig->count == be_checked->count);
+
+    /* advance while the nets at the current position agree */
+    int idx = 0;
+    while (idx < sig->count && sig->pins[idx]->net == be_checked->pins[idx]->net)
+        idx++;
+
+    /* equal only if the walk reached the end of the list */
+    return (idx >= sig->count);
+}
+
+/* Builds a new signal list whose entries are the results of copy_input_npin()
+ * applied to each pin of signalsvar, in the same order. signalsvar is not modified. */
+signal_list_t *copy_input_signals(signal_list_t *signalsvar)
+{
+    signal_list_t *copies = init_signal_list();
+
+    for (int i = 0; i < signalsvar->count; i++)
+        add_pin_to_signal_list(copies, copy_input_npin(signalsvar->pins[i]));
+
+    return copies;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_output_pins_for_existing_node)
+ * 	For each of the `width` output slots of node, creates a driver pin on the node,
+ * 	a new net driven by that pin, and one fanout pin on that net. The fanout pins
+ * 	are returned as a signal list so the node's outputs can be accessed in this form.
+ * 	node must have exactly `width` output pins (asserted).
+ *
+ * 	Each net gets its own copy of the node's name, as connect_nodes() does, so that
+ * 	the nets and the node never share one name buffer.
+ *-------------------------------------------------------------------------------------------*/
+signal_list_t *make_output_pins_for_existing_node(nnode_t *node, int width)
+{
+    signal_list_t *return_list = init_signal_list();
+
+    oassert(node->num_output_pins == width);
+
+    for (int i = 0; i < width; i++) {
+        npin_t *driver_pin = allocate_npin();
+        npin_t *fanout_pin = allocate_npin();
+        nnet_t *new_net = allocate_nnet();
+        /* own copy of the name instead of an alias of node->name */
+        new_net->name = vtr::strdup(node->name);
+
+        /* hook the output pin into the node */
+        add_output_pin_to_node(node, driver_pin, i);
+        /* the node's output pin drives the new net */
+        add_driver_pin_to_net(new_net, driver_pin);
+        /* the second pin hangs off the net as its fanout */
+        add_fanout_pin_to_net(new_net, fanout_pin);
+
+        /* callers get the fanout pin */
+        add_pin_to_signal_list(return_list, fanout_pin);
+    }
+
+    return return_list;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: free_signal_list)
+ * 	Releases the pin array of list and the list itself. The pins the array
+ * 	pointed to are not freed. A NULL list is accepted.
+ *-------------------------------------------------------------------------------------------*/
+void free_signal_list(signal_list_t *list)
+{
+    if (list != NULL) {
+        list->count = 0;
+        vtr::free(list->pins);
+    }
+
+    vtr::free(list);
+}
+
+/**
+ * -------------------------------------------------------------------------------------------
+ * (function: free_attribute)
+ *
+ * @brief releases an attribute structure together with its memory_id
+ * string to avoid mem leaks. A NULL attribute is accepted.
+ *
+ * @param attribute the given attribute structure
+ *-------------------------------------------------------------------------------------------*/
+void free_attribute(attr_t *attribute)
+{
+    if (attribute != NULL) {
+        /* the string goes first, then the structure that owns it */
+        vtr::free(attribute->memory_id);
+        attribute->memory_id = NULL;
+    }
+
+    vtr::free(attribute);
+}
+
+void depth_traverse_count(nnode_t *node, int *count, uintptr_t traverse_mark_number);
+
+/*---------------------------------------------------------------------------------------------
+ * (function: depth_traverse_count)
+ * 	Depth-first walk through the fanout of node. Every node that does not yet carry
+ * 	traverse_mark_number is stamped with it and increments *count by one; nodes that
+ * 	already carry the mark end the recursion. Empty output slots, output pins without
+ * 	a net, empty fanout slots and fanout pins without a node are skipped.
+ *-------------------------------------------------------------------------------------------*/
+void depth_traverse_count(nnode_t *node, int *count, uintptr_t traverse_mark_number)
+{
+    /* already counted during this traversal */
+    if (node->traverse_visited == traverse_mark_number)
+        return;
+
+    /* this is a new node so depth visit it */
+    (*count)++;
+    node->traverse_visited = traverse_mark_number;
+
+    for (int i = 0; i < node->num_output_pins; i++) {
+        npin_t *out_pin = node->output_pins[i];
+        /* output slots can be empty (e.g. after the pin was remapped to another node) */
+        if (out_pin == NULL || out_pin->net == NULL)
+            continue;
+
+        nnet_t *next_net = out_pin->net;
+        for (int j = 0; j < next_net->num_fanout_pins; j++) {
+            npin_t *fanout_pin = next_net->fanout_pins[j];
+            if (fanout_pin == NULL)
+                continue;
+
+            nnode_t *next_node = fanout_pin->node;
+            if (next_node == NULL)
+                continue;
+
+            depth_traverse_count(next_node, count, traverse_mark_number);
+        }
+    }
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function:  allocate_netlist)
+ * 	Allocates a netlist with every node/net pointer set to NULL, every counter set
+ * 	to zero, and three freshly created string caches (nets, output pins, nodes).
+ *-------------------------------------------------------------------------------------------*/
+netlist_t *allocate_netlist()
+{
+    netlist_t *netlist = (netlist_t *)my_malloc_struct(sizeof(netlist_t));
+
+    /* constant nodes and their nets */
+    netlist->gnd_node = NULL;
+    netlist->vcc_node = NULL;
+    netlist->pad_node = NULL;
+    netlist->zero_net = NULL;
+    netlist->one_net = NULL;
+    netlist->pad_net = NULL;
+
+    /* node collections */
+    netlist->top_input_nodes = NULL;
+    netlist->num_top_input_nodes = 0;
+    netlist->top_output_nodes = NULL;
+    netlist->num_top_output_nodes = 0;
+    netlist->ff_nodes = NULL;
+    netlist->num_ff_nodes = 0;
+    netlist->internal_nodes = NULL;
+    netlist->num_internal_nodes = 0;
+    netlist->clocks = NULL;
+    netlist->num_clocks = 0;
+
+    /* forward levelization */
+    netlist->forward_levels = NULL;
+    netlist->num_forward_levels = 0;
+    netlist->num_at_forward_level = NULL;
+
+    /* backward levelization */
+    netlist->backward_levels = NULL;
+    netlist->num_backward_levels = 0;
+    netlist->num_at_backward_level = NULL;
+
+    /* sequential levelization */
+    netlist->sequential_level_nodes = NULL;
+    netlist->num_sequential_levels = 0;
+    netlist->num_at_sequential_level = NULL;
+    netlist->sequential_level_combinational_termination_node = NULL;
+    netlist->num_sequential_level_combinational_termination_nodes = 0;
+    netlist->num_at_sequential_level_combinational_termination_node = NULL;
+
+    /* initialize the string caches */
+    netlist->nets_sc = sc_new_string_cache();
+    netlist->out_pins_sc = sc_new_string_cache();
+    netlist->nodes_sc = sc_new_string_cache();
+
+    return netlist;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function:  free_netlist)
+ * 	Releases the three string caches of the netlist (nets, output pins, nodes).
+ * 	Nothing else is released here: the netlist structure itself and its other
+ * 	members are left as they are. A NULL netlist is accepted.
+ *-------------------------------------------------------------------------------------------*/
+void free_netlist(netlist_t *to_free)
+{
+    if (to_free != NULL) {
+        sc_free_string_cache(to_free->nets_sc);
+        sc_free_string_cache(to_free->out_pins_sc);
+        sc_free_string_cache(to_free->nodes_sc);
+    }
+}
+
+/*
+ * Gets the index of the first output pin with the given mapping
+ * on the given node. Empty output slots and pins that carry no
+ * mapping are skipped. Returns -1 when no pin matches.
+ */
+int get_output_pin_index_from_mapping(nnode_t *node, const char *name)
+{
+    for (int i = 0; i < node->num_output_pins; i++) {
+        npin_t *pin = node->output_pins[i];
+        /* strcmp must never see a NULL pin or a NULL mapping */
+        if (pin == NULL || pin->mapping == NULL)
+            continue;
+        if (!strcmp(pin->mapping, name))
+            return i;
+    }
+
+    return -1;
+}
+
+/*
+ * Gets the index of the first output port containing a pin with the given
+ * mapping. Empty output slots and pins that carry no mapping are skipped.
+ * Returns -1 when no pin matches.
+ */
+int get_output_port_index_from_mapping(nnode_t *node, const char *name)
+{
+    /* running index into the flat output pin array, advanced once per pin of every port */
+    int pin_number = 0;
+    for (int i = 0; i < node->num_output_port_sizes; i++) {
+        for (int j = 0; j < node->output_port_sizes[i]; j++, pin_number++) {
+            npin_t *pin = node->output_pins[pin_number];
+            /* strcmp must never see a NULL pin or a NULL mapping */
+            if (pin == NULL || pin->mapping == NULL)
+                continue;
+            if (!strcmp(pin->mapping, name))
+                return i;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Gets the index of the first input pin with the given mapping.
+ * Empty input slots and pins that carry no mapping are skipped.
+ * Returns -1 when no pin matches.
+ */
+int get_input_pin_index_from_mapping(nnode_t *node, const char *name)
+{
+    for (int i = 0; i < node->num_input_pins; i++) {
+        npin_t *pin = node->input_pins[i];
+        /* strcmp must never see a NULL pin or a NULL mapping */
+        if (pin == NULL || pin->mapping == NULL)
+            continue;
+        if (!strcmp(pin->mapping, name))
+            return i;
+    }
+
+    return -1;
+}
+
+/*
+ * Gets the port index of the first port containing a pin with
+ * the given mapping. Empty input slots and pins that carry no
+ * mapping are skipped. Returns -1 when no pin matches.
+ */
+int get_input_port_index_from_mapping(nnode_t *node, const char *name)
+{
+    /* running index into the flat input pin array, advanced once per pin of every port */
+    int pin_number = 0;
+    for (int i = 0; i < node->num_input_port_sizes; i++) {
+        for (int j = 0; j < node->input_port_sizes[i]; j++, pin_number++) {
+            npin_t *pin = node->input_pins[pin_number];
+            /* strcmp must never see a NULL pin or a NULL mapping */
+            if (pin == NULL || pin->mapping == NULL)
+                continue;
+            if (!strcmp(pin->mapping, name))
+                return i;
+        }
+    }
+    return -1;
+}
+
+/**
+ * (function: legalize_polarity)
+ *
+ * @brief legalize pin polarity to RE
+ *
+ * The pin is first removed from the input slot of the node it is attached
+ * to (if any). For falling-edge / active-low polarity a LOGICAL_NOT gate is
+ * inserted: the given pin becomes the gate's input and a new pin, hanging
+ * off the gate's output net, is returned. For any other polarity the given
+ * pin itself is returned. The returned pin is marked rising-edge sensitive.
+ *
+ * @param pin first pin (must be an INPUT pin)
+ * @param pin_polarity first pin polarity
+ * @param node pointer to pins node for tracking purpose
+ *
+ * @return a pin with RISING_EDGE_SENSITIVITY polarity
+ */
+npin_t *legalize_polarity(npin_t *pin, edge_type_e pin_polarity, nnode_t *node)
+{
+    /* validate pins */
+    oassert(pin && node);
+    oassert(pin->type == INPUT);
+
+    /* release the slot the pin occupies on its current node */
+    if (pin->node != NULL)
+        pin->node->input_pins[pin->pin_node_idx] = NULL;
+
+    const bool needs_inversion = (pin_polarity == FALLING_EDGE_SENSITIVITY) || (pin_polarity == ACTIVE_LOW_SENSITIVITY);
+    if (!needs_inversion) {
+        /* polarity is already fine: hand back the same pin */
+        pin->sensitivity = RISING_EDGE_SENSITIVITY;
+        return (pin);
+    }
+
+    /* create a not gate and feed it with the original pin */
+    nnode_t *inverter = make_1port_gate(LOGICAL_NOT, 1, 1, node, node->traverse_visited);
+    add_input_pin_to_node(inverter, pin, 0);
+
+    /* output side of the gate: driver pin, net, and the fanout pin handed back */
+    npin_t *inverted_pin = allocate_npin();
+    npin_t *inverter_out = allocate_npin();
+    nnet_t *inverter_net = allocate_nnet();
+    inverter_net->name = make_full_ref_name(NULL, NULL, NULL, inverter->name, 0);
+
+    add_output_pin_to_node(inverter, inverter_out, 0);
+    add_driver_pin_to_net(inverter_net, inverter_out);
+    add_fanout_pin_to_net(inverter_net, inverted_pin);
+
+    /* set new pin polarity */
+    inverted_pin->sensitivity = RISING_EDGE_SENSITIVITY;
+
+    return (inverted_pin);
+}
+
+/**
+ * (function: reduce_input_ports)
+ *
+ * @brief reduce the input ports size by removing extra pad pins
+ *
+ * Steps, in order: gather the pins of every input port into its own signal
+ * list, trim each list with reduce_signal_list() (port 0 uses port_a_signed,
+ * port 1 uses port_b_signed), pad the shorter of two ports with pad pins up
+ * to the size of the longer one, create a replacement node of the same type
+ * with the new input widths and the same number of output pins, move the
+ * input and output pins over to it, and finally free the old node.
+ *
+ * @param node pointer to node; only nodes with one or two input ports are
+ * accepted (asserted). Passed by reference and overwritten with the new node.
+ * @param netlist pointer to the current netlist
+ *
+ * @return nothing, but set the node to a new node with reduced equalized
+ * input port sizes (if more than one input port exist)
+ */
+void reduce_input_ports(nnode_t *&node, netlist_t *netlist)
+{
+    oassert(node->num_input_port_sizes == 1 || node->num_input_port_sizes == 2);
+
+    int offset = 0;
+    nnode_t *new_node;
+
+    signal_list_t **input_ports = (signal_list_t **)vtr::calloc(node->num_input_port_sizes, sizeof(signal_list_t *));
+    /* add pins to signals lists, one list per port; offset is the index of the current port's first pin */
+    for (int i = 0; i < node->num_input_port_sizes; i++) {
+        /* initialize signal list */
+        input_ports[i] = init_signal_list();
+        for (int j = 0; j < node->input_port_sizes[i]; j++) {
+            add_pin_to_signal_list(input_ports[i], node->input_pins[j + offset]);
+        }
+        offset += node->input_port_sizes[i];
+    }
+
+    /* reduce the first port; reduce_signal_list frees the list passed in and returns a new one */
+    input_ports[0] = reduce_signal_list(input_ports[0], node->attributes->port_a_signed, netlist);
+    /* reduce the second port if exist */
+    if (node->num_input_port_sizes == 2) {
+        input_ports[1] = reduce_signal_list(input_ports[1], node->attributes->port_b_signed, netlist);
+
+        /* equalize port sizes: the shorter list is filled with pad pins up to the longer one */
+        int max = std::max(input_ports[0]->count, input_ports[1]->count);
+
+        while (input_ports[0]->count < max)
+            add_pin_to_signal_list(input_ports[0], get_pad_pin(netlist));
+        while (input_ports[1]->count < max)
+            add_pin_to_signal_list(input_ports[1], get_pad_pin(netlist));
+    }
+
+    /* creating a new node of the same type with the reduced input widths and unchanged output width */
+    new_node = (node->num_input_port_sizes == 1)
+                 ? make_1port_gate(node->type, input_ports[0]->count, node->num_output_pins, node, node->traverse_visited)
+                 : make_2port_gate(node->type, input_ports[0]->count, input_ports[1]->count, node->num_output_pins, node, node->traverse_visited);
+
+    /* copy the signedness attributes only */
+    copy_signedness(new_node->attributes, node->attributes);
+
+    /* hook the input pins of the first port into slots [0, count) of the new node */
+    for (int i = 0; i < input_ports[0]->count; i++) {
+        npin_t *pin = input_ports[0]->pins[i];
+        if (pin->node) {
+            /* pin is attached to a node: remap it to the new node */
+            remap_pin_to_new_node(input_ports[0]->pins[i], new_node, i);
+        } else {
+            /* pin has no node yet: add it to the new node */
+            add_input_pin_to_node(new_node, input_ports[0]->pins[i], i);
+        }
+    }
+    offset = input_ports[0]->count;
+
+    if (node->num_input_port_sizes == 2) {
+        for (int i = 0; i < input_ports[1]->count; i++) {
+            npin_t *pin = input_ports[1]->pins[i];
+            if (pin->node) {
+                /* pin is attached to a node: remap it, placed after the first port's pins */
+                remap_pin_to_new_node(input_ports[1]->pins[i], new_node, i + offset);
+            } else {
+                /* pin has no node yet: add it, placed after the first port's pins */
+                add_input_pin_to_node(new_node, input_ports[1]->pins[i], i + offset);
+            }
+        }
+    }
+
+    /* hook the output pins: each one keeps its index on the new node */
+    for (int i = 0; i < node->num_output_pins; i++) {
+        remap_pin_to_new_node(node->output_pins[i], new_node, i);
+    }
+
+    // CLEAN UP
+    for (int i = 0; i < node->num_input_port_sizes; i++) {
+        free_signal_list(input_ports[i]);
+    }
+    vtr::free(input_ports);
+    free_nnode(node);
+
+    node = new_node;
+}
+
+/**
+ * (function: reduce_signal_list)
+ *
+ * @brief reduce the size of signal list by removing extra pad pins
+ *
+ * Starting at the last pin and walking towards the first, every pin that is
+ * attached to the extension net (the one net for SIGNED, the zero net for
+ * UNSIGNED) is deleted; the walk stops at the first pin on any other net.
+ * The surviving pins are returned in a new list and the input list is freed.
+ *
+ * @param signalvar list of signals
+ * @param signedness the signedness of a port corresponding to the signal list
+ * @param netlist pointer to the current netlist
+ *
+ * @return a pruned signal list
+ */
+signal_list_t *reduce_signal_list(signal_list_t *signalvar, operation_list signedness, netlist_t *netlist)
+{
+    /* validate signedness */
+    oassert(signedness == operation_list::SIGNED || signedness == operation_list::UNSIGNED);
+
+    signal_list_t *reduced = init_signal_list();
+
+    /* specify the extension net */
+    nnet_t *extended_net;
+    if (signedness == operation_list::SIGNED)
+        extended_net = netlist->one_net;
+    else
+        extended_net = netlist->zero_net;
+
+    /* strip extension pins off the tail; `remaining` ends up as the number of slots left */
+    int remaining = signalvar->count;
+    while (remaining > 0 && signalvar->pins[remaining - 1]->net == extended_net) {
+        delete_npin(signalvar->pins[remaining - 1]);
+        signalvar->pins[remaining - 1] = NULL;
+        remaining--;
+    }
+
+    /* adding valuable pins to new signals list */
+    for (int i = 0; i < remaining; i++) {
+        npin_t *pin = signalvar->pins[i];
+        if (pin != NULL)
+            add_pin_to_signal_list(reduced, pin);
+    }
+
+    // CLEAN UP
+    free_signal_list(signalvar);
+
+    return (reduced);
+}
+
+/* Allocates a chain information record with no name and a count of zero. */
+chain_information_t *allocate_chain_info()
+{
+    chain_information_t *info = (chain_information_t *)my_malloc_struct(sizeof(chain_information_t));
+
+    info->count = 0;
+    info->name = NULL;
+
+    return info;
+}
+
+/**
+ * (function: equalize_ports_size)
+ *
+ * @brief Rebuild the given node so that its output port is as wide as
+ * its input port(s).
+ *
+ * The input ports are first pruned with reduce_input_ports(). When the
+ * width of the first input port already equals the output width the
+ * node is kept. Otherwise a node of the same type is created, the pins
+ * are remapped onto it, the old node is freed and `node` is redirected
+ * to the new one:
+ *  - output narrower than the input: the missing output pins are
+ *    created, each driving a new net that gets one fanout pin
+ *  - output wider than the input: each surplus output pin is moved to
+ *    its own BUF_NODE whose input is a pad pin
+ *
+ * NOTE: at most TWO input ports are supported and both must have the
+ * same width after pruning (asserted).
+ *
+ * NOTE(review): make_1port_gate() receives its mark as `short`, so the
+ * uintptr_t traverse_mark_number is narrowed at the call - confirm the
+ * marks in use fit.
+ *
+ * @param node node to equalize; updated through the reference
+ * @param traverse_mark_number unique traversal mark; the node must already carry it
+ * @param netlist pointer to the current netlist
+ */
+void equalize_ports_size(nnode_t *&node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+    oassert(node->num_input_port_sizes > 0 && node->num_input_port_sizes <= 2);
+
+    /**
+     * INPUTS
+     *  A: (width_a)
+     *  B: (width_b) [optional]
+     * OUTPUT
+     *  Y: width_y
+     */
+    /* prune the extra pad pins of the input ports; this may replace node */
+    reduce_input_ports(node, netlist);
+
+    const int width_a = node->input_port_sizes[0];
+    int width_b = -1;
+    if (node->num_input_port_sizes == 2) {
+        width_b = node->input_port_sizes[1];
+        /* both input ports must agree on their width */
+        oassert(width_a == width_b);
+    }
+    const int width_y = node->output_port_sizes[0];
+
+    /* widths already match, nothing to rebuild */
+    if (width_a == width_y)
+        return;
+
+    /* the output adopts the input width */
+    const int new_width_y = width_a;
+
+    /* build the replacement with one or two input ports */
+    nnode_t *replacement;
+    if (width_b == -1)
+        replacement = make_1port_gate(node->type, width_a, new_width_y, node, traverse_mark_number);
+    else
+        replacement = make_2port_gate(node->type, width_a, width_b, new_width_y, node, traverse_mark_number);
+
+    /* copy signedness attributes */
+    copy_signedness(replacement->attributes, node->attributes);
+
+    /* every input pin keeps its index on the replacement */
+    for (int pin_idx = 0; pin_idx < node->num_input_pins; pin_idx++)
+        remap_pin_to_new_node(node->input_pins[pin_idx], replacement, pin_idx);
+
+    /* fill the output pins of the replacement */
+    for (int pin_idx = 0; pin_idx < new_width_y; pin_idx++) {
+        if (pin_idx < width_y) {
+            /* an existing output pin is simply moved over */
+            remap_pin_to_new_node(node->output_pins[pin_idx], replacement, pin_idx);
+            continue;
+        }
+
+        /* no pin to reuse: create the output pin together with its net */
+        npin_t *driver_pin = allocate_npin();
+        npin_t *fanout_pin = allocate_npin();
+        nnet_t *fresh_net = allocate_nnet();
+        fresh_net->name = make_full_ref_name(NULL, NULL, NULL, replacement->name, pin_idx);
+        /* the driver pin belongs to the node and drives the new net */
+        add_output_pin_to_node(replacement, driver_pin, pin_idx);
+        add_driver_pin_to_net(fresh_net, driver_pin);
+        /* the second pin becomes the fanout of the new net */
+        add_fanout_pin_to_net(fresh_net, fanout_pin);
+    }
+
+    /* surplus output pins of the old node get driven from PAD through a buffer */
+    for (int pin_idx = new_width_y; pin_idx < width_y; pin_idx++) {
+        nnode_t *pad_buffer = make_1port_gate(BUF_NODE, 1, 1, node, traverse_mark_number);
+        add_input_pin_to_node(pad_buffer, get_pad_pin(netlist), 0);
+        remap_pin_to_new_node(node->output_pins[pin_idx], pad_buffer, 0);
+    }
+
+    // CLEAN UP
+    free_nnode(node);
+
+    node = replacement;
+}
+
+/**
+ * (function: remove_fanout_pins_from_net)
+ *
+ * @brief Remove the fanout slot `id` from a net.
+ *
+ * All fanout entries after `id` are shifted down by one position and
+ * the pin_net_idx of every shifted (non-NULL) pin is updated to its
+ * new position. The last slot is cleared and the fanout count is
+ * decremented. The pin that occupied slot `id` is neither freed nor
+ * modified by this function.
+ *
+ * A NULL net or an index outside [0, num_fanout_pins) is ignored;
+ * without this check the function would write past the fanout array
+ * and still decrement the count.
+ *
+ * @param net the net to remove the fanout slot from
+ * @param pin unused
+ * @param id index of the fanout slot to remove
+ */
+void remove_fanout_pins_from_net(nnet_t *net, npin_t * /*pin*/, int id)
+{
+    /* reject requests that do not address an existing fanout slot */
+    if (net == NULL || id < 0 || id >= net->num_fanout_pins)
+        return;
+
+    const int last = net->num_fanout_pins - 1;
+
+    /* close the gap and keep the back references of the moved pins valid */
+    for (int i = id; i < last; i++) {
+        net->fanout_pins[i] = net->fanout_pins[i + 1];
+        if (net->fanout_pins[i] != NULL)
+            net->fanout_pins[i]->pin_net_idx = i;
+    }
+
+    net->fanout_pins[last] = NULL;
+    net->num_fanout_pins = last;
+}
+
+/**
+ * (function: delete_npin)
+ *
+ * @brief Detach a pin from its node and its net, then free it.
+ *
+ * For an INPUT pin the node's input slot and the net's fanout slot
+ * that refer to the pin are set to NULL; for an OUTPUT pin the node's
+ * output slot and the net's driver slot are set to NULL. The slots are
+ * only cleared: the arrays are not compacted and the counters are left
+ * unchanged. Pins of any other type are freed without detaching.
+ *
+ * A pin may be attached to only one side (for instance a pin that was
+ * added to a net but never to a node), so a missing node or net is
+ * skipped instead of being dereferenced. A NULL pin is ignored.
+ *
+ * @param pin the pin to delete
+ */
+void delete_npin(npin_t *pin)
+{
+    if (pin == NULL)
+        return;
+
+    if (pin->type == INPUT) {
+        /* detach from its node */
+        if (pin->node != NULL)
+            pin->node->input_pins[pin->pin_node_idx] = NULL;
+        /* detach from its net */
+        if (pin->net != NULL)
+            pin->net->fanout_pins[pin->pin_net_idx] = NULL;
+    } else if (pin->type == OUTPUT) {
+        /* detach from its node */
+        if (pin->node != NULL)
+            pin->node->output_pins[pin->pin_node_idx] = NULL;
+        /* detach from its net */
+        if (pin->net != NULL)
+            pin->net->driver_pins[pin->pin_net_idx] = NULL;
+    }
+
+    // CLEAN UP
+    free_npin(pin);
+}
\ No newline at end of file
diff --git a/parmys-plugin/netlist/netlist_utils.h b/parmys-plugin/netlist/netlist_utils.h
new file mode 100644
index 000000000..38daf05e7
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_utils.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _NETLIST_UTILS_H_
+#define _NETLIST_UTILS_H_
+
+#include "odin_types.h"
+/* allocation of the basic netlist objects (node, pin, net) */
+nnode_t *allocate_nnode(loc_t loc);
+npin_t *allocate_npin();
+nnet_t *allocate_nnet();
+/* release of the same objects */
+nnode_t *free_nnode(nnode_t *to_free);
+npin_t *free_npin(npin_t *to_free);
+nnet_t *free_nnet(nnet_t *to_free);
+/* get_{zero,pad,one}_pin (defined in node_utils.cc) allocate a pin and add it as a fanout of the netlist's zero/pad/one net */
+npin_t *get_zero_pin(netlist_t *netlist);
+npin_t *get_pad_pin(netlist_t *netlist);
+npin_t *get_one_pin(netlist_t *netlist);
+npin_t *copy_input_npin(npin_t *copy_pin);
+npin_t *copy_output_npin(npin_t *copy_pin);
+
+void allocate_more_input_pins(nnode_t *node, int width);
+void allocate_more_output_pins(nnode_t *node, int width);
+/* attaching pins to nodes (by pin index) and to nets (as driver or fanout) */
+void move_input_pin(nnode_t *node, int old_idx, int new_idx);
+void move_output_pin(nnode_t *node, int old_idx, int new_idx);
+void add_input_pin_to_node(nnode_t *node, npin_t *pin, int pin_idx);
+void add_fanout_pin_to_net(nnet_t *net, npin_t *pin);
+void add_output_pin_to_node(nnode_t *node, npin_t *pin, int pin_idx);
+void add_driver_pin_to_net(nnet_t *net, npin_t *pin);
+void add_output_port_information(nnode_t *node, int port_width);
+void add_input_port_information(nnode_t *node, int port_width);
+
+void join_nets(nnet_t *net, nnet_t *input_net);
+/* moving an existing pin onto another net or node */
+void remap_pin_to_new_net(npin_t *pin, nnet_t *new_net);
+void remap_pin_to_new_node(npin_t *pin, nnode_t *new_node, int pin_idx);
+
+attr_t *init_attribute();
+void copy_attribute(attr_t *to, attr_t *copy);
+void copy_signedness(attr_t *to, attr_t *copy);
+void free_attribute(attr_t *attribute);
+/* signal lists (signal_list_t): a pins array together with its count */
+signal_list_t *init_signal_list();
+extern bool is_constant_signal(signal_list_t *signal, netlist_t *netlist);
+extern long constant_signal_value(signal_list_t *signal, netlist_t *netlist);
+extern signal_list_t *create_constant_signal(const long long value, const int desired_width, netlist_t *netlist);
+extern signal_list_t *prune_signal(signal_list_t *signalsvar, long signal_width, long prune_size, int num_of_signals);
+extern signal_list_t **split_signal_list(signal_list_t *signalsvar, const int width);
+extern bool sigcmp(signal_list_t *sig, signal_list_t *be_checked);
+void add_pin_to_signal_list(signal_list_t *list, npin_t *pin);
+signal_list_t *combine_lists(signal_list_t **signal_lists, int num_signal_lists);
+signal_list_t *copy_input_signals(signal_list_t *signalsvar);
+void free_signal_list(signal_list_t *list);
+
+signal_list_t *make_output_pins_for_existing_node(nnode_t *node, int width);
+void connect_nodes(nnode_t *out_node, int out_idx, nnode_t *in_node, int in_idx);
+
+netlist_t *allocate_netlist();
+void free_netlist(netlist_t *to_free);
+
+int get_output_pin_index_from_mapping(nnode_t *node, const char *name);
+int get_output_port_index_from_mapping(nnode_t *node, const char *name);
+int get_input_pin_index_from_mapping(nnode_t *node, const char *name);
+int get_input_port_index_from_mapping(nnode_t *node, const char *name);
+extern npin_t *legalize_polarity(npin_t *pin, edge_type_e pin_polarity, nnode_t *node);
+extern void reduce_input_ports(nnode_t *&node, netlist_t *netlist); /* may replace node through the reference */
+extern signal_list_t *reduce_signal_list(signal_list_t *signalvar, operation_list signedness, netlist_t *netlist); /* frees signalvar, returns a new list */
+chain_information_t *allocate_chain_info(); /* returned with name = NULL and count = 0 */
+void remove_fanout_pins_from_net(nnet_t *net, npin_t *pin, int id); /* pin is unused; id is the fanout index to drop */
+
+extern void equalize_ports_size(nnode_t *&node, uintptr_t traverse_mark_number, netlist_t *netlist); /* may replace node through the reference */
+extern void delete_npin(npin_t *pin); /* clears the pin's slots in its node and net, then frees the pin */
+
+#endif // _NETLIST_UTILS_H_
diff --git a/parmys-plugin/netlist/netlist_visualizer.cc b/parmys-plugin/netlist/netlist_visualizer.cc
new file mode 100644
index 000000000..305cd2d5b
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_visualizer.cc
@@ -0,0 +1,349 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "netlist_utils.h"
+#include "netlist_visualizer.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+/* forward declarations of the traversal helpers defined later in this file */
+void depth_first_traverse_visualize(nnode_t *node, FILE *fp, uintptr_t traverse_mark_number);
+void depth_first_traversal_graph_display(FILE *out, uintptr_t marker_value, netlist_t *netllist);
+/* the two helpers below are called by graphVizOutputCombinationalNet */
+void forward_traversal_net_graph_display(FILE *out, uintptr_t marker_value, nnode_t *node);
+void backward_traversal_net_graph_display(FILE *out, uintptr_t marker_value, nnode_t *node);
+
+/*---------------------------------------------------------------------------------------------
+ * (function: graphVizOutputNetlist)
+ * 	Writes the netlist as a Graphviz digraph into "<path>/<name>.dot".
+ *
+ * 	The graph body is produced by depth_first_traversal_graph_display(), which marks every
+ * 	node it emits with marker_value and skips nodes that already carry that value.
+ *
+ * 	The file name is assembled in a std::string, so its length is not limited by a fixed
+ * 	buffer. If the file cannot be opened a message is printed to stderr and nothing is
+ * 	written.
+ *
+ * 	path:         directory that receives the file
+ * 	name:         file name without the ".dot" extension
+ * 	marker_value: traversal mark used for this dump
+ * 	netlist:      netlist to display
+ *-------------------------------------------------------------------------------------------*/
+void graphVizOutputNetlist(std::string path, const char *name, uintptr_t marker_value, netlist_t *netlist)
+{
+    /* open the file */
+    const std::string path_and_file = path + "/" + name + ".dot";
+    FILE *fp = fopen(path_and_file.c_str(), "w");
+    if (fp == NULL) {
+        /* writing to or closing a NULL stream is undefined, so give up here */
+        fprintf(stderr, "Unable to open %s to write the netlist graph\n", path_and_file.c_str());
+        return;
+    }
+
+    /* open graph */
+    fprintf(fp, "digraph G {\n\tranksep=.25;\n");
+
+    depth_first_traversal_graph_display(fp, marker_value, netlist);
+
+    /* close graph */
+    fprintf(fp, "}\n");
+    fclose(fp);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: depth_first_traversal_graph_display)
+ * 	Starts a depth first visualization from every root of the netlist: each non-NULL top
+ * 	input node first, then the gnd node and finally the vcc node (when present).
+ *-------------------------------------------------------------------------------------------*/
+void depth_first_traversal_graph_display(FILE *out, uintptr_t marker_value, netlist_t *netlist)
+{
+    /* the primary inputs come first */
+    for (int idx = 0; idx < netlist->num_top_input_nodes; idx++) {
+        nnode_t *top_input = netlist->top_input_nodes[idx];
+        if (top_input == NULL)
+            continue;
+
+        depth_first_traverse_visualize(top_input, out, marker_value);
+    }
+
+    /* followed by the ground and vcc nodes */
+    nnode_t *gnd = netlist->gnd_node;
+    if (gnd != NULL) {
+        depth_first_traverse_visualize(gnd, out, marker_value);
+    }
+
+    nnode_t *vcc = netlist->vcc_node;
+    if (vcc != NULL) {
+        depth_first_traverse_visualize(vcc, out, marker_value);
+    }
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: depth_first_traverse_visualize)
+ * 	Emits the Graphviz statements for `node` and, recursively, for every node reachable
+ * 	through the fanouts of its output pins.
+ *
+ * 	A node that already carries traverse_mark_number is skipped; otherwise it is marked
+ * 	before its fanouts are followed, so every node is emitted once per mark. FF and BUF
+ * 	nodes are drawn as boxes; INPUT, CLOCK and OUTPUT nodes as triangles. The identifier of
+ * 	an OUTPUT node gets an "_O" suffix, both in its declaration and in edges. An edge is
+ * 	labelled with the name of the fanout pin when that pin has one.
+ *
+ * 	Empty output pin slots, pins without a net, empty fanout slots and fanout pins without
+ * 	a node are skipped.
+ *-------------------------------------------------------------------------------------------*/
+void depth_first_traverse_visualize(nnode_t *node, FILE *fp, uintptr_t traverse_mark_number)
+{
+    /* already emitted for this mark */
+    if (node->traverse_visited == traverse_mark_number)
+        return;
+
+    /* this is a new node so depth visit it */
+    node->traverse_visited = traverse_mark_number;
+
+    /* identifier of this node, computed once and shared by the declaration and all edges */
+    std::string node_label(make_simple_name(node->name, "^-+.", '_').c_str());
+    if (node->type == OUTPUT_NODE)
+        node_label += "_O";
+
+    if ((node->type == FF_NODE) || (node->type == BUF_NODE)) {
+        fprintf(fp, "\t\"%s\" [shape=box];\n", node_label.c_str());
+    } else if ((node->type == INPUT_NODE) || (node->type == CLOCK_NODE) || (node->type == OUTPUT_NODE)) {
+        fprintf(fp, "\t\"%s\" [shape=triangle];\n", node_label.c_str());
+    } else {
+        fprintf(fp, "\t\"%s\"\n", node_label.c_str());
+    }
+
+    for (int i = 0; i < node->num_output_pins; i++) {
+        /* a pin slot can be empty (delete_npin clears slots), so check it before its net */
+        if (node->output_pins[i] == NULL || node->output_pins[i]->net == NULL)
+            continue;
+
+        nnet_t *next_net = node->output_pins[i]->net;
+        for (int j = 0; j < next_net->num_fanout_pins; j++) {
+            npin_t *pin = next_net->fanout_pins[j];
+            if (pin == NULL || pin->node == NULL)
+                continue;
+
+            nnode_t *next_node = pin->node;
+
+            /* renaming for output nodes */
+            std::string next_label(make_simple_name(next_node->name, "^-+.", '_').c_str());
+            if (next_node->type == OUTPUT_NODE)
+                next_label += "_O";
+
+            fprintf(fp, "\t\"%s\" -> \"%s\"", node_label.c_str(), next_label.c_str());
+            if (pin->name)
+                fprintf(fp, "[label=\"%s\"]", pin->name);
+            fprintf(fp, ";\n");
+
+            depth_first_traverse_visualize(next_node, fp, traverse_mark_number);
+        }
+    }
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: graphVizOutputCombinationalNet)
+ * 	Writes the surroundings of current_node as a Graphviz digraph into "<path>/<name>.dot".
+ *
+ * 	The graph body is produced by forward_traversal_net_graph_display() followed by
+ * 	backward_traversal_net_graph_display(), both started at current_node and both using
+ * 	marker_value as their traversal mark.
+ *
+ * 	The file name is assembled in a std::string, so its length is not limited by a fixed
+ * 	buffer. If the file cannot be opened a message is printed to stderr and nothing is
+ * 	written.
+ *
+ * 	path:         directory that receives the file
+ * 	name:         file name without the ".dot" extension
+ * 	marker_value: traversal mark used for this dump
+ * 	current_node: node the two traversals start from
+ *-------------------------------------------------------------------------------------------*/
+void graphVizOutputCombinationalNet(std::string path, const char *name, uintptr_t marker_value, nnode_t *current_node)
+{
+    /* open the file */
+    const std::string path_and_file = path + "/" + name + ".dot";
+    FILE *fp = fopen(path_and_file.c_str(), "w");
+    if (fp == NULL) {
+        /* writing to or closing a NULL stream is undefined, so give up here */
+        fprintf(stderr, "Unable to open %s to write the combinational net graph\n", path_and_file.c_str());
+        return;
+    }
+
+    /* open graph */
+    fprintf(fp, "digraph G {\n\tranksep=.25;\n");
+
+    forward_traversal_net_graph_display(fp, marker_value, current_node);
+    backward_traversal_net_graph_display(fp, marker_value, current_node);
+
+    /* close graph */
+    fprintf(fp, "}\n");
+    fclose(fp);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: forward_traversal_net_graph_display)
+ * 	Emits Graphviz statements for the nodes reachable from `node` by following output pins
+ * 	to the fanouts of their nets.
+ *
+ * 	The nodes are kept in a work list that is processed front to back. A node is marked
+ * 	with marker_value when it is processed; a fanout node is appended when it does not
+ * 	carry the mark yet and is not an FF node, so the walk does not continue through FFs.
+ * 	Because marking happens at processing time, a node may be appended (and emitted) more
+ * 	than once. The start node is drawn as a red box; OUTPUT nodes get an "_O" suffix.
+ *
+ * 	The work list only borrows the nodes: they belong to the netlist and are NOT freed
+ * 	here. Freeing them would destroy the netlist being displayed, including the start node
+ * 	that graphVizOutputCombinationalNet passes to the backward traversal right afterwards,
+ * 	and would free nodes twice when they appear in the list more than once.
+ *-------------------------------------------------------------------------------------------*/
+void forward_traversal_net_graph_display(FILE *fp, uintptr_t marker_value, nnode_t *node)
+{
+    int index_in_stack = 0;
+    int num_stack_of_nodes = 1;
+
+    nnode_t **stack_of_nodes = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * 1);
+    stack_of_nodes[0] = node;
+
+    while (index_in_stack != num_stack_of_nodes) {
+        nnode_t *current_node = stack_of_nodes[index_in_stack];
+
+        /* mark it */
+        current_node->traverse_visited = marker_value;
+
+        /* printout the details of it */
+        const std::string current_name(make_simple_name(current_node->name, "^-+.", '_').c_str());
+        if (index_in_stack == 0) {
+            fprintf(fp, "\t%s [shape=box,color=red];\n", current_name.c_str());
+        } else if ((current_node->type == FF_NODE) || (current_node->type == BUF_NODE)) {
+            fprintf(fp, "\t%s [shape=box];\n", current_name.c_str());
+        } else if (current_node->type == INPUT_NODE) {
+            fprintf(fp, "\t%s [shape=triangle];\n", current_name.c_str());
+        } else if (current_node->type == CLOCK_NODE) {
+            fprintf(fp, "\t%s [shape=triangle];\n", current_name.c_str());
+        } else if (current_node->type == OUTPUT_NODE) {
+            fprintf(fp, "\t%s_O [shape=triangle];\n", current_name.c_str());
+        } else {
+            fprintf(fp, "\t%s [label=\"%d:%d\"];\n", current_name.c_str(), current_node->forward_level, current_node->backward_level);
+        }
+
+        /* identifier used for this node as the source of an edge (renamed for output nodes) */
+        const std::string source_label = (current_node->type == OUTPUT_NODE) ? current_name + "_O" : current_name;
+
+        /* at each node visit all the outputs */
+        for (int j = 0; j < current_node->num_output_pins; j++) {
+            npin_t *output_pin = current_node->output_pins[j];
+            /* the net must be checked before its fanout count is read */
+            if (output_pin == NULL || output_pin->net == NULL)
+                continue;
+
+            nnet_t *net = output_pin->net;
+            for (int k = 0; k < net->num_fanout_pins; k++) {
+                npin_t *fanout_pin = net->fanout_pins[k];
+                if (fanout_pin == NULL || fanout_pin->node == NULL)
+                    continue;
+
+                /* visit the fanout point */
+                nnode_t *next_node = fanout_pin->node;
+
+                /* renaming for output nodes */
+                std::string target_label(make_simple_name(next_node->name, "^-+.", '_').c_str());
+                if (next_node->type == OUTPUT_NODE)
+                    target_label += "_O";
+
+                /* a pin without a name gets an unlabelled edge instead of passing NULL to %s */
+                fprintf(fp, "\t%s -> %s", source_label.c_str(), target_label.c_str());
+                if (fanout_pin->name != NULL)
+                    fprintf(fp, " [label=\"%s\"]", fanout_pin->name);
+                fprintf(fp, ";\n");
+
+                if ((next_node->traverse_visited != marker_value) && (next_node->type != FF_NODE)) {
+                    /* IF - not visited yet then add to list */
+                    stack_of_nodes = (nnode_t **)vtr::realloc(stack_of_nodes, sizeof(nnode_t *) * (num_stack_of_nodes + 1));
+                    stack_of_nodes[num_stack_of_nodes] = next_node;
+                    num_stack_of_nodes++;
+                }
+            }
+        }
+
+        /* process next element in net */
+        index_in_stack++;
+    }
+
+    /* release the work list only; the nodes stay owned by the netlist */
+    vtr::free(stack_of_nodes);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: backward_traversal_net_graph_display)
+ * 	Emits Graphviz statements for the nodes reachable from `node` by following input pins
+ * 	to the drivers of their nets.
+ *
+ * 	The nodes are kept in a work list that is processed front to back. A node is marked
+ * 	with marker_value when it is processed; a driver node is appended when it does not
+ * 	carry the mark yet and is not an FF node, so the walk does not continue through FFs.
+ * 	Because marking happens at processing time, a node may be appended (and emitted) more
+ * 	than once. The start node itself is not declared here, only its edges are written.
+ * 	Each edge points from the driver node to the current node and is labelled with the
+ * 	name of the current node's input pin when that pin has one.
+ *
+ * 	The work list only borrows the nodes: they belong to the netlist and are NOT freed
+ * 	here. Freeing them would destroy the netlist being displayed and would free nodes
+ * 	twice when they appear in the list more than once.
+ *-------------------------------------------------------------------------------------------*/
+void backward_traversal_net_graph_display(FILE *fp, uintptr_t marker_value, nnode_t *node)
+{
+    int index_in_stack = 0;
+    int num_stack_of_nodes = 1;
+
+    nnode_t **stack_of_nodes = (nnode_t **)vtr::malloc(sizeof(nnode_t *) * 1);
+    stack_of_nodes[0] = node;
+
+    while (index_in_stack != num_stack_of_nodes) {
+        nnode_t *current_node = stack_of_nodes[index_in_stack];
+
+        /* mark it */
+        current_node->traverse_visited = marker_value;
+
+        /* printout the details of it */
+        const std::string current_name(make_simple_name(current_node->name, "^-+.", '_').c_str());
+        if (index_in_stack != 0) {
+            if ((current_node->type == FF_NODE) || (current_node->type == BUF_NODE)) {
+                fprintf(fp, "\t%s [shape=box];\n", current_name.c_str());
+            } else if (current_node->type == INPUT_NODE) {
+                fprintf(fp, "\t%s [shape=triangle];\n", current_name.c_str());
+            } else if (current_node->type == CLOCK_NODE) {
+                fprintf(fp, "\t%s [shape=triangle];\n", current_name.c_str());
+            } else if (current_node->type == OUTPUT_NODE) {
+                fprintf(fp, "\t%s_O [shape=triangle];\n", current_name.c_str());
+            } else {
+                fprintf(fp, "\t%s [label=\"%d:%d\"];\n", current_name.c_str(), current_node->forward_level, current_node->backward_level);
+            }
+        }
+
+        /* at each node visit all the inputs */
+        for (int j = 0; j < current_node->num_input_pins; j++) {
+            npin_t *input_pin = current_node->input_pins[j];
+            if (input_pin == NULL || input_pin->net == NULL || input_pin->net->num_driver_pins == 0)
+                continue;
+
+            /* visit the drivers of the net */
+            nnet_t *net = input_pin->net;
+            for (int k = 0; k < net->num_driver_pins; k++) {
+                /* a driver slot can be empty (delete_npin clears slots) */
+                npin_t *driver_pin = net->driver_pins[k];
+                if (driver_pin == NULL || driver_pin->node == NULL)
+                    continue;
+
+                nnode_t *next_node = driver_pin->node;
+                const std::string driver_name(make_simple_name(next_node->name, "^-+.", '_').c_str());
+
+                /* a pin without a name gets an unlabelled edge instead of passing NULL to %s */
+                fprintf(fp, "\t%s -> %s", driver_name.c_str(), current_name.c_str());
+                if (input_pin->name != NULL)
+                    fprintf(fp, " [label=\"%s\"]", input_pin->name);
+                fprintf(fp, ";\n");
+
+                if ((next_node->traverse_visited != marker_value) && (next_node->type != FF_NODE)) {
+                    /* IF - not visited yet then add to list */
+                    stack_of_nodes = (nnode_t **)vtr::realloc(stack_of_nodes, sizeof(nnode_t *) * (num_stack_of_nodes + 1));
+                    stack_of_nodes[num_stack_of_nodes] = next_node;
+                    num_stack_of_nodes++;
+                }
+            }
+        }
+
+        /* process next element in net */
+        index_in_stack++;
+    }
+
+    /* release the work list only; the nodes stay owned by the netlist */
+    vtr::free(stack_of_nodes);
+}
diff --git a/parmys-plugin/netlist/netlist_visualizer.h b/parmys-plugin/netlist/netlist_visualizer.h
new file mode 100644
index 000000000..545653352
--- /dev/null
+++ b/parmys-plugin/netlist/netlist_visualizer.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _NETLIST_VISUALIZER_H_
+#define _NETLIST_VISUALIZER_H_
+
+#include "odin_types.h"
+#include <string>
+/* both functions write a Graphviz digraph to "<path>/<name>.dot" */
+void graphVizOutputNetlist(std::string path, const char *name, uintptr_t marker_value, netlist_t *input_netlist); /* traversal starts at the top input, gnd and vcc nodes */
+void graphVizOutputCombinationalNet(std::string path, const char *name, uintptr_t marker_value, nnode_t *current_node); /* forward then backward traversal from current_node */
+
+#endif // _NETLIST_VISUALIZER_H_
diff --git a/parmys-plugin/netlist/node_utils.cc b/parmys-plugin/netlist/node_utils.cc
new file mode 100644
index 000000000..a6326bd5f
--- /dev/null
+++ b/parmys-plugin/netlist/node_utils.cc
@@ -0,0 +1,405 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "node_utils.h"
+#include "netlist_utils.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "odin_util.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+long unique_node_name_id = 0;
+
+/*-----------------------------------------------------------------------
+ * (function: get_pad_pin)
+ * 	Allocates a new pin named after pad_string and adds it as a fanout
+ * 	of netlist->pad_net, so the caller can attach to the constant
+ * 	net driving hb_pad
+ *---------------------------------------------------------------------*/
+npin_t *get_pad_pin(netlist_t *netlist)
+{
+    npin_t *pin = allocate_npin();
+
+    pin->name = vtr::strdup(pad_string);
+    add_fanout_pin_to_net(netlist->pad_net, pin);
+
+    return pin;
+}
+
+/*-----------------------------------------------------------------------
+ * (function: get_zero_pin)
+ * 	Allocates a new pin named after zero_string and adds it as a fanout
+ * 	of netlist->zero_net, so the caller can attach to the constant
+ * 	net driving zero
+ *---------------------------------------------------------------------*/
+npin_t *get_zero_pin(netlist_t *netlist)
+{
+    npin_t *pin = allocate_npin();
+
+    pin->name = vtr::strdup(zero_string);
+    add_fanout_pin_to_net(netlist->zero_net, pin);
+
+    return pin;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: get_one_pin)
+ * 	Allocates a new pin named after one_string and adds it as a fanout of netlist->one_net,
+ * 	so the caller can attach to the constant net driving one
+ *-------------------------------------------------------------------------------------------*/
+npin_t *get_one_pin(netlist_t *netlist)
+{
+    npin_t *pin = allocate_npin();
+
+    pin->name = vtr::strdup(one_string);
+    add_fanout_pin_to_net(netlist->one_net, pin);
+
+    return pin;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_not_gate_with_input)
+ * 	Builds a not gate with make_not_gate() and hooks input_pin into its input slot 0.
+ * 	The output pin of the gate is left for the caller to connect.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *make_not_gate_with_input(npin_t *input_pin, nnode_t *node, short mark)
+{
+    nnode_t *not_node = make_not_gate(node, mark);
+
+    /* the given pin becomes the single input of the gate */
+    add_input_pin_to_node(not_node, input_pin, 0);
+
+    return not_node;
+}
+
+/*-------------------------------------------------------------------------
+ * (function: make_not_gate)
+ * 	Allocates a LOGICAL_NOT node with room for one input pin and one
+ * 	output pin. Location and related AST node are taken from `node`,
+ * 	the name is derived from node->name through node_name(), and the
+ * 	new node is marked as visited with `mark`. No pins are connected.
+ *-----------------------------------------------------------------------*/
+nnode_t *make_not_gate(nnode_t *node, short mark)
+{
+    nnode_t *not_node = allocate_nnode(node->loc);
+
+    not_node->traverse_visited = mark;
+    not_node->type = LOGICAL_NOT;
+    /* the type is assigned before the name is generated, as node_name() receives the new node */
+    not_node->name = node_name(not_node, node->name);
+    not_node->related_ast_node = node->related_ast_node;
+
+    /* one slot on each side */
+    allocate_more_input_pins(not_node, 1);
+    allocate_more_output_pins(not_node, 1);
+
+    return not_node;
+}
+
+/**
+ * -------------------------------------------------------------------------
+ * (function: make_inverter)
+ * @brief Making a not gate that takes over the given input pin and
+ * drives a newly allocated net through a newly allocated output pin
+ *
+ * The pin is removed from the input slot of the node it currently
+ * belongs to (if any) and becomes input 0 of the not gate. The new net
+ * is named after the not gate and receives one new fanout pin that is
+ * not attached to any node here.
+ *
+ * @param pin input pin (asserted to be of type INPUT)
+ * @param node related netlist node
+ * @param mark netlist traversal number
+ *
+ * @return not node
+ *-----------------------------------------------------------------------*/
+nnode_t *make_inverter(npin_t *pin, nnode_t *node, short mark)
+{
+    /* validate the input pin */
+    oassert(pin->type == INPUT);
+
+    /* a bare LOGICAL_NOT node with one input and one output slot */
+    nnode_t *not_node = make_not_gate(node, mark);
+
+    /* release the slot the pin occupies on its current node */
+    if (pin->node != NULL) {
+        pin->node->input_pins[pin->pin_node_idx] = NULL;
+    }
+
+    /* hook the pin into the not node */
+    add_input_pin_to_node(not_node, pin, 0);
+
+    /* connecting the not_node output pin */
+    npin_t *driver_pin = allocate_npin();
+    npin_t *fanout_pin = allocate_npin();
+    nnet_t *output_net = allocate_nnet();
+    output_net->name = make_full_ref_name(NULL, NULL, NULL, not_node->name, 0);
+
+    /* node -> driver pin -> net -> fanout pin */
+    add_output_pin_to_node(not_node, driver_pin, 0);
+    add_driver_pin_to_net(output_net, driver_pin);
+    add_fanout_pin_to_net(output_net, fanout_pin);
+
+    return not_node;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_1port_gate)
+ * 	Allocates a node of the given type with a single input port of width_input pins and a
+ * 	single output port of width_output pins. Location and related AST node are taken from
+ * 	`node`, the name is derived from node->name and the new node is marked with `mark`.
+ * 	No pins are connected.
+ *
+ * 	NOTE(review): `mark` is a short while equalize_ports_size() passes a uintptr_t
+ * 	traversal mark, so the value is narrowed at that call - confirm the marks in use fit.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *make_1port_gate(operation_list type, int width_input, int width_output, nnode_t *node, short mark)
+{
+    nnode_t *gate = allocate_nnode(node->loc);
+
+    gate->traverse_visited = mark;
+    gate->type = type;
+    gate->name = node_name(gate, node->name);
+    gate->related_ast_node = node->related_ast_node;
+
+    /* the single input port */
+    allocate_more_input_pins(gate, width_input);
+    add_input_port_information(gate, width_input);
+
+    /* the single output port */
+    allocate_more_output_pins(gate, width_output);
+    add_output_port_information(gate, width_output);
+
+    return gate;
+}
+/*---------------------------------------------------------------------------------------------
+ * (function: make_1port_logic_gate)
+ * 	Make a gate with one input port of `width` pins and a single output pin.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *make_1port_logic_gate(operation_list type, int width, nnode_t *node, short mark)
+{
+    /* a logic gate is a 1-port gate whose output is exactly one pin wide */
+    return make_1port_gate(type, width, 1, node, mark);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_3port_gate)
+ * 	Allocate a gate with three input ports and one output port, every width chosen by the
+ * 	caller. Input pins are laid out port after port: port1 first, then port2, then port3.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *make_3port_gate(operation_list type, int width_port1, int width_port2, int width_port3, int width_output, nnode_t *node, short mark)
+{
+    nnode_t *gate = allocate_nnode(node->loc);
+
+    /* the type has to be set before the name is generated: node_name() reads it */
+    gate->type = type;
+    gate->traverse_visited = mark;
+    gate->related_ast_node = node->related_ast_node;
+    gate->name = node_name(gate, node->name);
+
+    /* append the input ports in declaration order */
+    const int input_widths[] = {width_port1, width_port2, width_port3};
+    for (int port_width : input_widths) {
+        allocate_more_input_pins(gate, port_width);
+        add_input_port_information(gate, port_width);
+    }
+
+    /* the output port */
+    allocate_more_output_pins(gate, width_output);
+    add_output_port_information(gate, width_output);
+
+    return gate;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_2port_gate)
+ * 	Allocate a gate with two input ports and one output port, every width chosen by the
+ * 	caller. The pins of port1 are allocated first, followed by the pins of port2.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *make_2port_gate(operation_list type, int width_port1, int width_port2, int width_output, nnode_t *node, short mark)
+{
+    nnode_t *gate = allocate_nnode(node->loc);
+
+    /* the type has to be set before the name is generated: node_name() reads it */
+    gate->type = type;
+    gate->traverse_visited = mark;
+    gate->related_ast_node = node->related_ast_node;
+    gate->name = node_name(gate, node->name);
+
+    /* append the input ports in declaration order */
+    const int input_widths[] = {width_port1, width_port2};
+    for (int port_width : input_widths) {
+        allocate_more_input_pins(gate, port_width);
+        add_input_port_information(gate, port_width);
+    }
+
+    /* the output port */
+    allocate_more_output_pins(gate, width_output);
+    add_output_port_information(gate, width_output);
+
+    return gate;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_nport_gate)
+ * 	Allocate a gate with `port_sizes` input ports, each `width` pins wide, and one output
+ * 	port of `width_output` pins. Despite its name, `port_sizes` is the NUMBER of input ports.
+ *-------------------------------------------------------------------------------------------*/
+nnode_t *make_nport_gate(operation_list type, int port_sizes, int width, int width_output, nnode_t *node, short mark)
+{
+    nnode_t *gate = allocate_nnode(node->loc);
+
+    /* the type has to be set before the name is generated: node_name() reads it */
+    gate->type = type;
+    gate->traverse_visited = mark;
+    gate->related_ast_node = node->related_ast_node;
+    gate->name = node_name(gate, node->name);
+
+    /* every input port has the same width */
+    for (int remaining = port_sizes; remaining > 0; remaining--) {
+        allocate_more_input_pins(gate, width);
+        add_input_port_information(gate, width);
+    }
+
+    /* the output port */
+    allocate_more_output_pins(gate, width_output);
+    add_output_port_information(gate, width_output);
+
+    return gate;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: edge_type_blif_str)
+ * 	Translate a sensitivity kind into its two-letter tag ("fe", "re", "ah", "al", "as").
+ * 	Any other value is reported through error_message() and yields NULL.
+ *-------------------------------------------------------------------------------------------*/
+const char *edge_type_blif_str(edge_type_e edge_type, loc_t loc)
+{
+    const char *blif_tag = NULL;
+
+    switch (edge_type) {
+    case FALLING_EDGE_SENSITIVITY:
+        blif_tag = "fe";
+        break;
+    case RISING_EDGE_SENSITIVITY:
+        blif_tag = "re";
+        break;
+    case ACTIVE_HIGH_SENSITIVITY:
+        blif_tag = "ah";
+        break;
+    case ACTIVE_LOW_SENSITIVITY:
+        blif_tag = "al";
+        break;
+    case ASYNCHRONOUS_SENSITIVITY:
+        blif_tag = "as";
+        break;
+    default:
+        /* no tag exists for this kind; blif_tag stays NULL */
+        error_message(NETLIST, loc, "undefined sensitivity kind for flip flop %s", edge_type_e_STR[edge_type]);
+        break;
+    }
+
+    return blif_tag;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: node_name)
+ * 	Create a unique name for `node`, derived from the node's type and the given prefix.
+ * 	node->type must already be set when this is called.
+ *-------------------------------------------------------------------------------------------*/
+char *node_name(nnode_t *node, char *instance_name_prefix)
+{
+    return op_node_name(node->type, instance_name_prefix);
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: op_node_name)
+ * 	Create a unique node name from the instance prefix, the name of the operation and the
+ * 	running unique_node_name_id counter, then advance the counter.
+ *-------------------------------------------------------------------------------------------*/
+char *op_node_name(operation_list op, char *instance_prefix_name)
+{
+    /* build the name with the current id ... */
+    char *unique_name = make_full_ref_name(instance_prefix_name, NULL, NULL, name_based_on_op(op), unique_node_name_id);
+
+    /* ... and only then move on to the next id */
+    ++unique_node_name_id;
+
+    return unique_name;
+}
+
+/**
+ * (function: make_multiport_smux)
+ *
+ * @brief make a multiport mux with given selector and signals lists
+ *
+ * The selector pins form the first input port. Each muxed input then gets its own
+ * port, as wide as the widest input; inputs narrower than that are padded with pad
+ * pins. The single output port has the same width. Pins that are already attached
+ * to a node are remapped onto the mux, loose pins are simply added.
+ *
+ * @param inputs list of input signal lists
+ * @param selector signal list of selector pins
+ * @param num_muxed_inputs num of inputs to be muxed
+ * @param outs list of output signals (at most as wide as the widest input), or NULL
+ *             to let the mux create its own output pins
+ * @param node pointing to the src node (supplies location, name prefix and traversal mark)
+ * @param netlist pointer to the netlist (source of the pad pins)
+ *
+ * @return mux node
+ */
+nnode_t *make_multiport_smux(signal_list_t **inputs, signal_list_t *selector, int num_muxed_inputs, signal_list_t *outs, nnode_t *node,
+                             netlist_t *netlist)
+{
+    /* validation: the selector must be wide enough to address every muxed input */
+    int valid_num_mux_inputs = shift_left_value_with_overflow_check(0X1, selector->count, node->loc);
+    oassert(valid_num_mux_inputs >= num_muxed_inputs);
+
+    /* index of the first pin of the input port currently being filled */
+    int offset = 0;
+
+    nnode_t *mux = allocate_nnode(node->loc);
+    mux->type = MULTIPORT_nBIT_SMUX;
+    mux->name = node_name(mux, node->name);
+    mux->traverse_visited = node->traverse_visited;
+
+    /* add selector signal */
+    add_input_port_information(mux, selector->count);
+    allocate_more_input_pins(mux, selector->count);
+    for (int i = 0; i < selector->count; i++) {
+        npin_t *sel = selector->pins[i];
+        /* hook selector into mux node as first port */
+        if (sel->node)
+            remap_pin_to_new_node(sel, mux, i);
+        else
+            add_input_pin_to_node(mux, sel, i);
+    }
+    offset += selector->count;
+
+    /* the widest input decides the width of every input port and of the output port */
+    int max_width = 0;
+    for (int i = 0; i < num_muxed_inputs; i++) {
+        if (inputs[i]->count > max_width)
+            max_width = inputs[i]->count;
+    }
+
+    for (int i = 0; i < num_muxed_inputs; i++) {
+        /* add input port data */
+        add_input_port_information(mux, max_width);
+        allocate_more_input_pins(mux, max_width);
+
+        /* walk the whole port, not just the supplied pins, so that narrower
+         * inputs really are padded instead of leaving NULL input pins behind */
+        for (int j = 0; j < max_width; j++) {
+            if (j < inputs[i]->count) {
+                npin_t *pin = inputs[i]->pins[j];
+                /* hook inputs into mux node */
+                if (pin->node)
+                    remap_pin_to_new_node(pin, mux, j + offset);
+                else
+                    add_input_pin_to_node(mux, pin, j + offset);
+            } else {
+                /* pad with PAD node */
+                add_input_pin_to_node(mux, get_pad_pin(netlist), j + offset);
+            }
+        }
+        /* record offset to have correct idx for adding pins */
+        offset += max_width;
+    }
+
+    /* add output info */
+    add_output_port_information(mux, max_width);
+    allocate_more_output_pins(mux, max_width);
+
+    // specify output pin
+    if (outs != NULL) {
+        /* only max_width output pins exist on the mux; more cannot be hooked up */
+        oassert(outs->count <= max_width);
+
+        for (int i = 0; i < max_width; i++) {
+            if (i < outs->count) {
+                npin_t *output_pin = outs->pins[i];
+                if (output_pin->node)
+                    remap_pin_to_new_node(output_pin, mux, i);
+                else
+                    add_output_pin_to_node(mux, output_pin, i);
+            } else {
+                /* no pin supplied for this bit: drive a fresh net from a new pin */
+                npin_t *output_pin = allocate_npin();
+                nnet_t *output_net = allocate_nnet();
+                /* add output driver to the output net */
+                add_driver_pin_to_net(output_net, output_pin);
+                /* add output pin to the mux node */
+                add_output_pin_to_node(mux, output_pin, i);
+            }
+        }
+    } else {
+        signal_list_t *outputs = make_output_pins_for_existing_node(mux, max_width);
+        // only the pins are needed, not the list that carried them
+        free_signal_list(outputs);
+    }
+
+    return (mux);
+}
diff --git a/parmys-plugin/netlist/node_utils.h b/parmys-plugin/netlist/node_utils.h
new file mode 100644
index 000000000..a456e0dd3
--- /dev/null
+++ b/parmys-plugin/netlist/node_utils.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _NODE_UTILS_H_
+#define _NODE_UTILS_H_
+
+#include "odin_types.h"
+
+nnode_t *make_not_gate_with_input(npin_t *input_pin, nnode_t *node, short mark);
+// make_inverter() wires `pin` into a new LOGICAL_NOT node; make_1port_logic_gate() is make_1port_gate() with a 1-pin output.
+nnode_t *make_not_gate(nnode_t *node, short mark);
+nnode_t *make_inverter(npin_t *pin, nnode_t *node, short mark);
+nnode_t *make_1port_logic_gate(operation_list type, int width, nnode_t *node, short mark);
+// Gate allocators: one input port per width argument (make_nport_gate: `port_sizes` ports of `width` pins each) plus one output port.
+nnode_t *make_1port_gate(operation_list type, int width_input, int width_output, nnode_t *node, short mark);
+nnode_t *make_2port_gate(operation_list type, int width_port1, int width_port2, int width_output, nnode_t *node, short mark);
+nnode_t *make_3port_gate(operation_list type, int width_port1, int width_port2, int width_port3, int width_output, nnode_t *node, short mark);
+nnode_t *make_nport_gate(operation_list type, int port_sizes, int width, int width_output, nnode_t *node, short mark);
+// Unique names built from the prefix, the operation name and a running id; node_name() uses node->type as the operation.
+char *node_name(nnode_t *node, char *instance_prefix_name);
+char *op_node_name(operation_list op, char *instance_prefix_name);
+// Two-letter tag for a sensitivity kind ("fe", "re", "ah", "al", "as"); reports an error and returns NULL for any other value.
+const char *edge_type_blif_str(edge_type_e edge_type, loc_t loc);
+// Builds a MULTIPORT_nBIT_SMUX node: the selector pins form the first input port, followed by one port per muxed input.
+extern nnode_t *make_multiport_smux(signal_list_t **inputs, signal_list_t *selector, int num_muxed_inputs, signal_list_t *outs, nnode_t *node,
+                                    netlist_t *netlist);
+#endif // _NODE_UTILS_H_
diff --git a/parmys-plugin/parmys.cc b/parmys-plugin/parmys.cc
new file mode 100644
index 000000000..2084d8190
--- /dev/null
+++ b/parmys-plugin/parmys.cc
@@ -0,0 +1,1184 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "kernel/celltypes.h"
+#include "kernel/yosys.h"
+
+#include <regex>
+
+#include "netlist_utils.h"
+#include "odin_globals.h"
+#include "odin_ii.h"
+#include "odin_util.h"
+
+#include "vtr_memory.h"
+#include "vtr_path.h"
+#include "vtr_util.h"
+
+#include "partial_map.h"
+
+#include "netlist_visualizer.h"
+
+#include "parmys_resolve.h"
+
+#include "adder.h"
+#include "arch_util.h"
+#include "block_memory.h"
+#include "hard_block.h"
+#include "memory.h"
+#include "multiplier.h"
+#include "netlist_cleanup.h"
+#include "netlist_statistic.h"
+#include "netlist_visualizer.h"
+#include "read_xml_config_file.h"
+#include "subtractor.h"
+
+#include "ast_util.h"
+#include "parmys_update.h"
+#include "parmys_utils.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+#define GND_NAME "$false" // name of the ground node, hash key of the zero net, and pin name used for constant-0 bits
+#define VCC_NAME "$true" // name of the vcc node, hash key of the one net, and pin name used for constant-1 bits
+#define HBPAD_NAME "$undef" // name of the pad node, hash key of the pad net, and pin name used for every other constant bit
+// NOTE(review): Bbox is not referenced in the visible part of this file; presumably a black box's name and port names - confirm.
+struct Bbox {
+    std::string name;
+    std::vector<std::string> inputs, outputs;
+};
+// Cell-type table: filled by ct.setup() in to_netlist() and queried through ct.cell_known() in from_yosys_type().
+CellTypes ct;
+
+struct ParMYSPass : public Pass {
+
+    /**
+     * Attach the input pins of all internal nodes, flip-flop nodes and top-level
+     * output nodes (in that order) to the nets registered in output_nets_hash.
+     */
+    static void hook_up_nets(netlist_t *odin_netlist, Hashtable *output_nets_hash)
+    {
+        // hook up every node of one of the netlist's node arrays
+        auto hook_up_all = [output_nets_hash](nnode_t **nodes, int count) {
+            for (int idx = 0; idx < count; idx++) {
+                hook_up_node(nodes[idx], output_nets_hash);
+            }
+        };
+
+        hook_up_all(odin_netlist->internal_nodes, odin_netlist->num_internal_nodes);
+        hook_up_all(odin_netlist->ff_nodes, odin_netlist->num_ff_nodes);
+        hook_up_all(odin_netlist->top_output_nodes, odin_netlist->num_top_output_nodes);
+    }
+
+    /**
+     * Make every input pin of `node` a fanout of the net stored in output_nets_hash
+     * under the pin's name. A pin without a matching net is reported via log_error().
+     */
+    static void hook_up_node(nnode_t *node, Hashtable *output_nets_hash)
+    {
+        for (int pin_idx = 0; pin_idx < node->num_input_pins; pin_idx++) {
+            npin_t *sink_pin = node->input_pins[pin_idx];
+
+            // the net is looked up by the name given to the pin when it was created
+            nnet_t *driver_net = (nnet_t *)output_nets_hash->get(sink_pin->name);
+            if (driver_net == nullptr) {
+                log_error("Error: Could not hook up the pin %s: not available, related node: %s.", sink_pin->name, node->name);
+            }
+
+            add_fanout_pin_to_net(driver_net, sink_pin);
+        }
+    }
+
+    /**
+     * Create an OUTPUT_NODE named name_str with one input pin of the same name and
+     * append it to the netlist's top_output_nodes array. The pin is not attached to
+     * a net here.
+     */
+    static void build_top_output_node(const char *name_str, netlist_t *odin_netlist)
+    {
+        nnode_t *output_node = allocate_nnode(my_location);
+        output_node->related_ast_node = NULL;
+        output_node->type = OUTPUT_NODE;
+        output_node->name = vtr::strdup(name_str);
+
+        // a single one-pin input port
+        allocate_more_input_pins(output_node, 1);
+        add_input_port_information(output_node, 1);
+
+        npin_t *sink_pin = allocate_npin();
+        sink_pin->name = vtr::strdup(name_str);
+        add_input_pin_to_node(output_node, sink_pin, 0);
+
+        // grow the array of top-level outputs by one slot and store the node in it
+        const int slot = odin_netlist->num_top_output_nodes;
+        odin_netlist->top_output_nodes = (nnode_t **)vtr::realloc(odin_netlist->top_output_nodes, sizeof(nnode_t *) * (slot + 1));
+        odin_netlist->top_output_nodes[slot] = output_node;
+        odin_netlist->num_top_output_nodes = slot + 1;
+    }
+
+    /**
+     * Create an INPUT_NODE named name_str whose single output pin drives a new net
+     * of the same name, append the node to the netlist's top_input_nodes array and
+     * register the net in output_nets_hash under name_str.
+     */
+    static void build_top_input_node(const char *name_str, netlist_t *odin_netlist, Hashtable *output_nets_hash)
+    {
+        loc_t my_loc;
+        nnode_t *input_node = allocate_nnode(my_loc);
+        input_node->related_ast_node = NULL;
+        input_node->type = INPUT_NODE;
+        input_node->name = vtr::strdup(name_str);
+
+        // a single one-pin output port
+        allocate_more_output_pins(input_node, 1);
+        add_output_port_information(input_node, 1);
+
+        npin_t *driver_pin = allocate_npin();
+        driver_pin->name = vtr::strdup(name_str);
+        driver_pin->type = OUTPUT;
+        add_output_pin_to_node(input_node, driver_pin, 0);
+
+        // the net driven by this top-level input
+        nnet_t *input_net = allocate_nnet();
+        input_net->name = vtr::strdup(name_str);
+        add_driver_pin_to_net(input_net, driver_pin);
+
+        // grow the array of top-level inputs by one slot and store the node in it
+        const int slot = odin_netlist->num_top_input_nodes;
+        odin_netlist->top_input_nodes = (nnode_t **)vtr::realloc(odin_netlist->top_input_nodes, sizeof(nnode_t *) * (slot + 1));
+        odin_netlist->top_input_nodes[slot] = input_node;
+        odin_netlist->num_top_input_nodes = slot + 1;
+
+        output_nets_hash->add(name_str, input_net);
+    }
+
+    /**
+     * Create the netlist's three constant drivers (ground, vcc and pad): each is
+     * a node with one output pin driving its own net. The nets are named after the
+     * netlist identifier and registered in output_nets_hash under GND_NAME,
+     * VCC_NAME and HBPAD_NAME; the nodes take those same three names.
+     */
+    static void create_top_driver_nets(netlist_t *odin_netlist, Hashtable *output_nets_hash)
+    {
+        // allocate a net plus the single-output node of the given type that drives it
+        auto make_constant_driver = [](nnet_t *&net, nnode_t *&driver_node, operation_list node_type) {
+            net = allocate_nnet();
+            driver_node = allocate_nnode(unknown_location);
+            driver_node->type = node_type;
+            allocate_more_output_pins(driver_node, 1);
+            add_output_port_information(driver_node, 1);
+            npin_t *driver_pin = allocate_npin();
+            add_output_pin_to_node(driver_node, driver_pin, 0);
+            add_driver_pin_to_net(net, driver_pin);
+        };
+
+        /* ZERO, ONE and PAD drivers, in that order */
+        make_constant_driver(odin_netlist->zero_net, odin_netlist->gnd_node, GND_NODE);
+        make_constant_driver(odin_netlist->one_net, odin_netlist->vcc_node, VCC_NODE);
+        make_constant_driver(odin_netlist->pad_net, odin_netlist->pad_node, PAD_NODE);
+
+        /* name the ZERO net and make it reachable under GND_NAME */
+        odin_netlist->zero_net->name = make_full_ref_name(odin_netlist->identifier, NULL, NULL, zero_string, -1);
+        output_nets_hash->add(GND_NAME, odin_netlist->zero_net);
+
+        /* name the ONE net and make it reachable under VCC_NAME */
+        odin_netlist->one_net->name = make_full_ref_name(odin_netlist->identifier, NULL, NULL, one_string, -1);
+        output_nets_hash->add(VCC_NAME, odin_netlist->one_net);
+
+        /* name the PAD net and make it reachable under HBPAD_NAME */
+        odin_netlist->pad_net->name = make_full_ref_name(odin_netlist->identifier, NULL, NULL, pad_string, -1);
+        output_nets_hash->add(HBPAD_NAME, odin_netlist->pad_net);
+
+        /* the driver nodes carry the hash keys as their names */
+        odin_netlist->vcc_node->name = vtr::strdup(VCC_NAME);
+        odin_netlist->gnd_node->name = vtr::strdup(GND_NAME);
+        odin_netlist->pad_node->name = vtr::strdup(HBPAD_NAME);
+    }
+
+    /**
+     * Return a newly allocated name for one signal bit and record the bit in
+     * cstr_bits_seen. Constant bits map to GND_NAME (0), VCC_NAME (1) or HBPAD_NAME
+     * (anything else). Wire bits use the unescaped wire name; for wires wider than
+     * one bit the bit index, adjusted for start_offset and `upto`, is passed on to
+     * make_full_ref_name(). The visible callers release the result with vtr::free().
+     */
+    static char *sig_full_ref_name_sig(RTLIL::SigBit sig, pool<SigBit> &cstr_bits_seen)
+    {
+        cstr_bits_seen.insert(sig);
+
+        // constant bit: no wire behind it
+        if (sig.wire == NULL) {
+            const char *const_name = HBPAD_NAME;
+            if (sig == RTLIL::State::S0) {
+                const_name = GND_NAME;
+            } else if (sig == RTLIL::State::S1) {
+                const_name = VCC_NAME;
+            }
+            return vtr::strdup(const_name);
+        }
+
+        std::string wire_name = RTLIL::unescape_id(sig.wire->name);
+
+        // single-bit wire: -1 is passed instead of a bit index
+        if (sig.wire->width == 1) {
+            return make_full_ref_name(NULL, NULL, NULL, wire_name.c_str(), -1);
+        }
+
+        // multi-bit wire: `upto` wires count their bits in the opposite direction
+        int bit_idx;
+        if (sig.wire->upto) {
+            bit_idx = sig.wire->start_offset + sig.wire->width - sig.offset - 1;
+        } else {
+            bit_idx = sig.wire->start_offset + sig.offset;
+        }
+        return make_full_ref_name(NULL, NULL, NULL, wire_name.c_str(), bit_idx);
+    }
+
+    /**
+     * Append one input port to `node` for the cell connection `mapping`: one new
+     * pin per bit of in_port, named after the signal bit and tagged with the
+     * unescaped port name. The pins are not attached to nets here.
+     */
+    static void map_input_port(const RTLIL::IdString &mapping, SigSpec in_port, nnode_t *node, pool<SigBit> &cstr_bits_seen)
+    {
+        // the new port starts right after the pins the node already has
+        const int first_pin_idx = node->num_input_pins;
+        const int port_width = in_port.size();
+        const std::string port_name = RTLIL::unescape_id(mapping);
+
+        allocate_more_input_pins(node, port_width);
+        add_input_port_information(node, port_width);
+
+        for (int bit = 0; bit < port_width; bit++) {
+            char *bit_name = sig_full_ref_name_sig(in_port[bit], cstr_bits_seen);
+
+            npin_t *sink_pin = allocate_npin();
+            sink_pin->name = vtr::strdup(bit_name);
+            sink_pin->mapping = vtr::strdup(port_name.c_str());
+            add_input_pin_to_node(node, sink_pin, first_pin_idx + bit);
+
+            // the pin keeps its own copy of the name
+            vtr::free(bit_name);
+        }
+    }
+
+    /**
+     * Append one output port to `node` for the cell connection `mapping`: one new,
+     * unnamed pin per bit of out_port, tagged with the unescaped port name. Each pin
+     * becomes the driver of the net registered in output_nets_hash under the bit's
+     * name; a missing net is created and registered first.
+     */
+    static void map_output_port(const RTLIL::IdString &mapping, SigSpec out_port, nnode_t *node, Hashtable *output_nets_hash,
+                                pool<SigBit> &cstr_bits_seen)
+    {
+        // the new port starts right after the pins the node already has
+        const int first_pin_idx = node->num_output_pins;
+        const int port_width = out_port.size();
+        const std::string port_name = RTLIL::unescape_id(mapping);
+
+        allocate_more_output_pins(node, port_width);
+        add_output_port_information(node, port_width);
+
+        for (int bit = 0; bit < port_width; bit++) {
+            npin_t *driver_pin = allocate_npin();
+            driver_pin->name = NULL;
+            driver_pin->mapping = vtr::strdup(port_name.c_str());
+            add_output_pin_to_node(node, driver_pin, first_pin_idx + bit);
+
+            // find the net for this bit, creating and registering it on first use
+            char *bit_name = sig_full_ref_name_sig(out_port[bit], cstr_bits_seen);
+            nnet_t *driven_net = (nnet_t *)output_nets_hash->get(bit_name);
+            if (driven_net == nullptr) {
+                driven_net = allocate_nnet();
+                driven_net->name = vtr::strdup(bit_name);
+                output_nets_hash->add(bit_name, driven_net);
+            }
+            add_driver_pin_to_net(driven_net, driver_pin);
+
+            // the net keeps its own copy of the name
+            vtr::free(bit_name);
+        }
+    }
+
+    /**
+     * Tell whether to_netlist() has to copy cell parameters into the attributes of
+     * a node of the given operation type.
+     */
+    static bool is_param_required(operation_list op)
+    {
+        bool required = false;
+
+        switch (op) {
+        case SL:
+        case SR:
+        case ASL:
+        case ASR:
+        case DLATCH:
+        case ADLATCH:
+        case SETCLR:
+        case SDFF:
+        case DFFE:
+        case SDFFE:
+        case SDFFCE:
+        case DFFSR:
+        case DFFSRE:
+        case SPRAM:
+        case DPRAM:
+        case YMEM:
+        case YMEM2:
+        case FF_NODE:
+            required = true;
+            break;
+        default:
+            break;
+        }
+
+        return required;
+    }
+
+    /**
+     * Map a Yosys cell type onto an operation_list value. Types listed in the table
+     * below map directly; built-in flip-flop cell types and any type known to the
+     * global cell-type table `ct` map to SKIP; everything else yields NO_OP.
+     */
+    static operation_list from_yosys_type(RTLIL::IdString type)
+    {
+        struct TypeMapping {
+            RTLIL::IdString yosys_type;
+            operation_list odin_type;
+        };
+
+        // checked top to bottom; the first match wins
+        static const TypeMapping direct_mappings[] = {
+          {ID($add), ADD},
+          {ID($mem), YMEM},
+          {ID($mem_v2), YMEM2},
+          {ID($mul), MULTIPLY},
+          {ID($sub), MINUS},
+          {ID(LUT_K), SKIP},
+          {ID(DFF), FF_NODE},
+          {ID(fpga_interconnect), operation_list_END},
+          {ID(mux), SMUX_2},
+          {ID(adder), ADD},
+          {ID(multiply), MULTIPLY},
+          {ID(single_port_ram), SPRAM},
+          {ID(dual_port_ram), DPRAM},
+        };
+
+        for (const TypeMapping &entry : direct_mappings) {
+            if (type == entry.yosys_type) {
+                return entry.odin_type;
+            }
+        }
+
+        // built-in flip-flops and other known cells are skipped
+        if (RTLIL::builtin_ff_cell_types().count(type) || ct.cell_known(type)) {
+            return SKIP;
+        }
+
+        return NO_OP;
+    }
+
+    static netlist_t *to_netlist(RTLIL::Module *top_module, RTLIL::Design *design)
+    {
+        ct.setup();
+
+        std::vector<RTLIL::Module *> mod_list;
+
+        std::vector<RTLIL::Module *> black_list;
+
+        std::string top_module_name;
+        if (top_module_name.empty())
+            for (auto module : design->modules())
+                if (module->get_bool_attribute(ID::top))
+                    top_module_name = module->name.str();
+
+        for (auto module : design->modules()) {
+
+            if (module->processes.size() != 0) {
+                log_error("Found unmapped processes in module %s: unmapped processes are not supported in parmys pass!\n", log_id(module->name));
+            }
+
+            if (module->memories.size() != 0) {
+                log_error("Found unmapped memories in module %s: unmapped memories are not supported in parmys pass!\n", log_id(module->name));
+            }
+
+            if (module->name == RTLIL::escape_id(top_module_name)) {
+                top_module_name.clear();
+                continue;
+            }
+
+            mod_list.push_back(module);
+        }
+
+        pool<SigBit> cstr_bits_seen;
+
+        netlist_t *odin_netlist = allocate_netlist();
+        odin_netlist->design = design;
+        Hashtable *output_nets_hash = new Hashtable();
+        odin_netlist->identifier = vtr::strdup(log_id(top_module->name));
+
+        create_top_driver_nets(odin_netlist, output_nets_hash);
+
+        // build_top_input_node(DEFAULT_CLOCK_NAME, odin_netlist, output_nets_hash);
+
+        std::map<int, RTLIL::Wire *> inputs, outputs;
+
+        for (auto wire : top_module->wires()) {
+            if (wire->port_input)
+                inputs[wire->port_id] = wire;
+            if (wire->port_output)
+                outputs[wire->port_id] = wire;
+        }
+
+        for (auto &it : inputs) {
+            RTLIL::Wire *wire = it.second;
+            for (int i = 0; i < wire->width; i++) {
+                char *name_string = sig_full_ref_name_sig(RTLIL::SigBit(wire, i), cstr_bits_seen);
+                build_top_input_node(name_string, odin_netlist, output_nets_hash);
+                vtr::free(name_string);
+            }
+        }
+
+        for (auto &it : outputs) {
+            RTLIL::Wire *wire = it.second;
+            for (int i = 0; i < wire->width; i++) {
+                char *name_string = sig_full_ref_name_sig(RTLIL::SigBit(wire, i), cstr_bits_seen);
+                build_top_output_node(name_string, odin_netlist);
+                vtr::free(name_string);
+            }
+        }
+
+        long hard_id = 0;
+        for (auto cell : top_module->cells()) {
+
+            nnode_t *new_node = allocate_nnode(my_location);
+
+            for (auto &param : cell->parameters) {
+                new_node->cell_parameters[RTLIL::IdString(param.first)] = Const(param.second);
+            }
+
+            new_node->related_ast_node = NULL;
+
+            // new_node->type = yosys_subckt_strmap[str(cell->type).c_str()];
+            new_node->type = from_yosys_type(cell->type);
+
+            // check primitive node type is alreday mapped before or not (blackboxed)
+            if (new_node->type == SPRAM || new_node->type == DPRAM || new_node->type == ADD || new_node->type == MULTIPLY) {
+                if (design->module(cell->type) != nullptr && design->module(cell->type)->get_blackbox_attribute()) {
+                    new_node->type = SKIP;
+                }
+            }
+
+            if (new_node->type == NO_OP) {
+
+                /**
+                 *  according to ast.cc:1657-1663
+                 *
+                 * 	std::string modname;
+                 *	if (parameters.size() == 0)
+                 *		modname = stripped_name;
+                 *	else if (para_info.size() > 60)
+                 *		modname = "$paramod$" + sha1(para_info) + stripped_name;
+                 *	else
+                 *		modname = "$paramod" + stripped_name + para_info;
+                 */
+
+                if (cell->type.begins_with("$paramod$")) // e.g. $paramod$b509a885304d9c8c49f505bb9d0e99a9fb676562\dual_port_ram
+                {
+                    std::regex regex("^\\$paramod\\$\\w+\\\\(\\w+)$");
+                    std::smatch m;
+                    std::string modname(str(cell->type));
+                    if (regex_match(modname, m, regex)) {
+                        new_node->type = yosys_subckt_strmap[m.str(1).c_str()];
+                    }
+                } else if (cell->type.begins_with("$paramod\\")) // e.g. $paramod\dual_port_ram\ADDR_WIDTH?4'0100\DATA_WIDTH?4'0101
+                {
+                    std::regex regex("^\\$paramod\\\\(\\w+)(\\\\\\S+)*$");
+                    std::smatch m;
+                    std::string modname(str(cell->type));
+                    if (regex_match(modname, m, regex)) {
+                        new_node->type = yosys_subckt_strmap[m.str(1).c_str()];
+                    }
+                } else if (design->module(cell->type)->get_blackbox_attribute()) {
+                    new_node->type = SKIP;
+                } else {
+                    new_node->type = HARD_IP;
+                    t_model *hb_model = find_hard_block(str(cell->type).c_str());
+                    if (hb_model) {
+                        hb_model->used = 1;
+                    }
+                    std::string modname(str(cell->type));
+                    // Create a fake ast node.
+                    if (new_node->type == HARD_IP) {
+                        new_node->related_ast_node = create_node_w_type(HARD_BLOCK, my_location);
+                        new_node->related_ast_node->children = (ast_node_t **)vtr::calloc(1, sizeof(ast_node_t *));
+                        new_node->related_ast_node->identifier_node = create_tree_node_id(vtr::strdup(modname.c_str()), my_location);
+                    }
+                }
+            }
+
+            if (new_node->type == SKIP) {
+                std::string modname(str(cell->type));
+                // fake ast node.
+                new_node->related_ast_node = create_node_w_type(HARD_BLOCK, my_location);
+                new_node->related_ast_node->children = (ast_node_t **)vtr::calloc(1, sizeof(ast_node_t *));
+                new_node->related_ast_node->identifier_node = create_tree_node_id(vtr::strdup(modname.c_str()), my_location);
+            }
+
+            for (auto &conn : cell->connections()) {
+
+                if (cell->input(conn.first) && conn.second.size() > 0) {
+                    map_input_port(conn.first, conn.second, new_node, cstr_bits_seen);
+                }
+
+                if (cell->output(conn.first) && conn.second.size() > 0) {
+                    map_output_port(conn.first, conn.second, new_node, output_nets_hash, cstr_bits_seen);
+                }
+            }
+
+            if (is_param_required(new_node->type)) {
+
+                if (cell->hasParam(ID::SRST_VALUE)) {
+                    auto value = vtr::strdup(cell->getParam(ID::SRST_VALUE).as_string().c_str());
+                    new_node->attributes->sreset_value = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::ARST_VALUE)) {
+                    auto value = vtr::strdup(cell->getParam(ID::ARST_VALUE).as_string().c_str());
+                    new_node->attributes->areset_value = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::OFFSET)) {
+                    auto value = vtr::strdup(cell->getParam(ID::OFFSET).as_string().c_str());
+                    new_node->attributes->offset = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::SIZE)) {
+                    auto value = vtr::strdup(cell->getParam(ID::SIZE).as_string().c_str());
+                    new_node->attributes->size = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::WIDTH)) {
+                    auto value = vtr::strdup(cell->getParam(ID::WIDTH).as_string().c_str());
+                    new_node->attributes->DBITS = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::RD_PORTS)) {
+                    auto value = vtr::strdup(cell->getParam(ID::RD_PORTS).as_string().c_str());
+                    new_node->attributes->RD_PORTS = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::WR_PORTS)) {
+                    auto value = vtr::strdup(cell->getParam(ID::WR_PORTS).as_string().c_str());
+                    new_node->attributes->WR_PORTS = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::ABITS)) {
+                    auto value = vtr::strdup(cell->getParam(ID::ABITS).as_string().c_str());
+                    new_node->attributes->ABITS = std::bitset<sizeof(long) * 8>(value).to_ulong();
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::MEMID)) {
+                    auto value = vtr::strdup(cell->getParam(ID::MEMID).as_string().c_str());
+                    RTLIL::IdString ids = cell->getParam(ID::MEMID).decode_string();
+                    new_node->attributes->memory_id = vtr::strdup(RTLIL::unescape_id(ids).c_str());
+                    vtr::free(value);
+                }
+
+                if (cell->hasParam(ID::A_SIGNED)) {
+                    new_node->attributes->port_a_signed = cell->getParam(ID::A_SIGNED).as_bool() ? SIGNED : UNSIGNED;
+                }
+
+                if (cell->hasParam(ID::B_SIGNED)) {
+                    new_node->attributes->port_b_signed = cell->getParam(ID::B_SIGNED).as_bool() ? SIGNED : UNSIGNED;
+                }
+
+                if (cell->hasParam(ID::CLK_POLARITY)) {
+                    new_node->attributes->clk_edge_type =
+                      cell->getParam(ID::CLK_POLARITY).as_bool() ? RISING_EDGE_SENSITIVITY : FALLING_EDGE_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::CLR_POLARITY)) {
+                    new_node->attributes->clr_polarity =
+                      cell->getParam(ID::CLR_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::SET_POLARITY)) {
+                    new_node->attributes->set_polarity =
+                      cell->getParam(ID::SET_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::EN_POLARITY)) {
+                    new_node->attributes->enable_polarity =
+                      cell->getParam(ID::EN_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::ARST_POLARITY)) {
+                    new_node->attributes->areset_polarity =
+                      cell->getParam(ID::ARST_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::SRST_POLARITY)) {
+                    new_node->attributes->sreset_polarity =
+                      cell->getParam(ID::SRST_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::RD_CLK_ENABLE)) {
+                    new_node->attributes->RD_CLK_ENABLE =
+                      cell->getParam(ID::RD_CLK_ENABLE).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::WR_CLK_ENABLE)) {
+                    new_node->attributes->WR_CLK_ENABLE =
+                      cell->getParam(ID::WR_CLK_ENABLE).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::RD_CLK_POLARITY)) {
+                    new_node->attributes->RD_CLK_POLARITY =
+                      cell->getParam(ID::RD_CLK_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+
+                if (cell->hasParam(ID::WR_CLK_POLARITY)) {
+                    new_node->attributes->WR_CLK_POLARITY =
+                      cell->getParam(ID::WR_CLK_POLARITY).as_bool() ? ACTIVE_HIGH_SENSITIVITY : ACTIVE_LOW_SENSITIVITY;
+                }
+            }
+
+            if (new_node->type == SMUX_2) {
+                new_node->name = vtr::strdup(new_node->output_pins[0]->net->name);
+            } else {
+                new_node->name = vtr::strdup(
+                  stringf("%s~%ld", (((new_node->type == HARD_IP /*|| new_node->type == SKIP*/) ? "\\" : "") + str(cell->type)).c_str(), hard_id++)
+                    .c_str());
+            }
+
+            /*add this node to blif_netlist as an internal node */
+            odin_netlist->internal_nodes =
+              (nnode_t **)vtr::realloc(odin_netlist->internal_nodes, sizeof(nnode_t *) * (odin_netlist->num_internal_nodes + 1));
+            odin_netlist->internal_nodes[odin_netlist->num_internal_nodes++] = new_node;
+        }
+
+        // add intermediate buffer nodes
+        for (auto &conn : top_module->connections())
+            for (int i = 0; i < conn.first.size(); i++) {
+                SigBit lhs_bit = conn.first[i];
+                SigBit rhs_bit = conn.second[i];
+
+                if (cstr_bits_seen.count(lhs_bit) == 0)
+                    continue;
+
+                nnode_t *buf_node = allocate_nnode(my_location);
+
+                buf_node->related_ast_node = NULL;
+
+                buf_node->type = BUF_NODE;
+
+                allocate_more_input_pins(buf_node, 1);
+                add_input_port_information(buf_node, 1);
+
+                char *in_pin_name = sig_full_ref_name_sig(rhs_bit, cstr_bits_seen);
+                npin_t *in_pin = allocate_npin();
+                in_pin->name = vtr::strdup(in_pin_name);
+                in_pin->type = INPUT;
+                add_input_pin_to_node(buf_node, in_pin, 0);
+
+                vtr::free(in_pin_name);
+
+                allocate_more_output_pins(buf_node, 1);
+                add_output_port_information(buf_node, 1);
+
+                npin_t *out_pin = allocate_npin();
+                out_pin->name = NULL;
+                add_output_pin_to_node(buf_node, out_pin, 0);
+
+                char *output_pin_name = sig_full_ref_name_sig(lhs_bit, cstr_bits_seen);
+                nnet_t *out_net = (nnet_t *)output_nets_hash->get(output_pin_name);
+                if (out_net == nullptr) {
+                    out_net = allocate_nnet();
+                    out_net->name = vtr::strdup(output_pin_name);
+                    output_nets_hash->add(output_pin_name, out_net);
+                }
+                add_driver_pin_to_net(out_net, out_pin);
+
+                buf_node->name = vtr::strdup(output_pin_name);
+
+                odin_netlist->internal_nodes =
+                  (nnode_t **)vtr::realloc(odin_netlist->internal_nodes, sizeof(nnode_t *) * (odin_netlist->num_internal_nodes + 1));
+                odin_netlist->internal_nodes[odin_netlist->num_internal_nodes++] = buf_node;
+
+                vtr::free(output_pin_name);
+            }
+
+        hook_up_nets(odin_netlist, output_nets_hash);
+
+        delete output_nets_hash;
+        return odin_netlist;
+    }
+
+    // Mode overload: forwards each child pb_type of `mode` to the t_pb_type overload, which appends the LUTs it finds to pb_lut_list.
+    void get_physical_luts(std::vector<t_pb_type *> &pb_lut_list, t_mode *mode)
+    {
+        for (int child = 0; child < mode->num_pb_type_children; ++child)
+            get_physical_luts(pb_lut_list, mode->pb_type_children + child);
+    }
+
+    // Depth-first LUT search: a LUT_CLASS pb_type is appended to pb_lut_list, any other one is searched through each of its modes; null is ignored.
+    void get_physical_luts(std::vector<t_pb_type *> &pb_lut_list, t_pb_type *pb_type)
+    {
+        if (pb_type == nullptr)
+            return;
+        if (pb_type->class_type == LUT_CLASS) {
+            pb_lut_list.push_back(pb_type);
+            return;
+        }
+        for (int mode_idx = 0; mode_idx < pb_type->num_modes; ++mode_idx)
+            get_physical_luts(pb_lut_list, pb_type->modes + mode_idx);
+    }
+
+    // Collects the LUT-class pb_types of every logical block type whose index is not
+    // EMPTY_TYPE_INDEX and lowers the global physical_lut_size to the smallest num_input_pins
+    // among them; a current value below 1 is overwritten by the first LUT seen.
+    void set_physical_lut_size(std::vector<t_logical_block_type> &logical_block_types)
+    {
+        std::vector<t_pb_type *> found_luts;
+        for (t_logical_block_type &block_type : logical_block_types) {
+            if (block_type.index == EMPTY_TYPE_INDEX)
+                continue;
+            get_physical_luts(found_luts, block_type.pb_type);
+        }
+        for (t_pb_type *lut : found_luts) {
+            const bool unset = physical_lut_size < 1;
+            if (lut != nullptr && (unset || lut->num_input_pins < physical_lut_size))
+                physical_lut_size = lut->num_input_pins;
+        }
+    }
+
+    // Elaboration stage: runs the hard multiplier / hard adder / hard block initialization
+    // routines, calls resolve_top() on the netlist and logs the wall-clock time this took.
+    static void elaborate(netlist_t *odin_netlist)
+    {
+        const double started_at = wall_time();
+        /* initialization routines that have to run before the netlist is resolved */
+        find_hard_multipliers();
+        find_hard_adders();
+        register_hard_blocks();
+
+        resolve_top(odin_netlist);
+
+        const double elapsed = wall_time() - started_at;
+        log("\nElaboration Time: ");
+        log_time(elapsed);
+        log("\n--------------------------------------------------------------------\n");
+    }
+
+    // Optimization stage: for a non-null netlist, processes whichever of the hard multiplier, block
+    // memory, single/dual port RAM and hard adder groups are present; the elapsed time is always logged.
+    static void optimization(netlist_t *odin_netlist)
+    {
+        const double started_at = wall_time();
+        if (odin_netlist) {
+            /* entry point for all netlist optimizations */
+            log("Performing Optimization on the Netlist\n");
+            /* split the multipliers for the hard block mults */
+            if (hard_multipliers) {
+                reduce_operations(odin_netlist, MULTIPLY);
+                iterate_multipliers(odin_netlist);
+                clean_multipliers();
+            }
+
+            /* hard block registration and width splitting for Yosys generated memory blocks */
+            if (block_memories_info.read_only_memory_list || block_memories_info.block_memory_list) {
+                iterate_block_memories(odin_netlist);
+                free_block_memories();
+            }
+
+            /* split any hard block memories */
+            if (single_port_rams || dual_port_rams) {
+                iterate_memories(odin_netlist);
+                free_memory_lists();
+            }
+
+            if (hard_adders) {
+                /* split the adders for the hard block add */
+                reduce_operations(odin_netlist, ADD);
+                iterate_adders(odin_netlist);
+                clean_adders();
+                /* split the adders for the hard block sub */
+                reduce_operations(odin_netlist, MINUS);
+                iterate_adders_for_sub(odin_netlist);
+                clean_adders_for_sub();
+            }
+        }
+
+        const double elapsed = wall_time() - started_at;
+        log("\nOptimization Time: ");
+        log_time(elapsed);
+        log("\n--------------------------------------------------------------------\n");
+    }
+
+    // Partial technology mapping stage: for a non-null netlist runs partial_map_top(), the mixer's
+    // optimizations and remove_unused_logic(); the elapsed wall-clock time is always logged.
+    static void techmap(netlist_t *odin_netlist)
+    {
+        const double started_at = wall_time();
+        if (odin_netlist) {
+            /* convert the netlist to a format compatible with the FPGA or other hardware target */
+            log("Performing Partial Technology Mapping to the target device\n");
+            partial_map_top(odin_netlist);
+            mixer->perform_optimizations(odin_netlist);
+            /* find whatever logic is unused in the netlist and remove it */
+            remove_unused_logic(odin_netlist);
+        }
+
+        const double elapsed = wall_time() - started_at;
+        log("\nTechmap Time: ");
+        log_time(elapsed);
+        log("\n--------------------------------------------------------------------\n");
+    }
+
+    // Calls the multiplier, adder and subtractor distribution report routines and then
+    // compute_statistics() on the netlist; does nothing when the netlist is null.
+    static void report(netlist_t *odin_netlist)
+    {
+        if (odin_netlist == nullptr)
+            return;
+
+        report_mult_distribution();
+        report_add_distribution();
+        report_sub_distribution();
+        compute_statistics(odin_netlist, true);
+    }
+
+    static void log_time(double time) { log("%.1fms", time * 1000); } // logs `time` * 1000 with one decimal and an "ms" suffix (so `time` is presumably in seconds)
+
+    ParMYSPass() : Pass("parmys", "ODIN_II partial mapper for Yosys") {} // passes the command name "parmys" and its one-line description to the Pass base class
+    // Prints the option summary of the `parmys` command; both mults options configure the MULTIPLY mixing, hence "multipliers".
+    void help() override
+    {
+        log("\n");
+        log("    -a ARCHITECTURE_FILE\n");
+        log("        VTR FPGA architecture description file (XML)\n");
+        log("\n");
+        log("    -c XML_CONFIGURATION_FILE\n");
+        log("        Configuration file\n");
+        log("\n");
+        log("    -top top_module\n");
+        log("        set the specified module as design top module\n");
+        log("\n");
+        log("    -nopass\n");
+        log("        No additional passes will be executed.\n");
+        log("\n");
+        log("    -exact_mults int_value\n");
+        log("        To enable mixing hard block and soft logic implementation of multipliers\n");
+        log("\n");
+        log("    -mults_ratio float_value\n");
+        log("        To enable mixing hard block and soft logic implementation of multipliers\n");
+        log("\n");
+        log("    -vtr_prim\n");
+        log("        loads vtr primitives as modules, if the design uses vtr primitives then this flag is mandatory for first run\n");
+        log("\n");
+        log("    -viz\n");
+        log("        visualizes the netlist at 3 different stages: elaborated, optimized, and mapped.\n\n");
+    }
+    /**
+     * Runs the `parmys` command: parses the options, loads the optional configuration (-c) and
+     * architecture (-a) files, runs the Yosys preparation script unless -nopass is given, turns
+     * the top module into an Odin-II netlist, elaborates / optimizes / partially maps it and
+     * rebuilds `design` from the result together with the black boxes that were present before.
+     * @param args    command tokens, args[0] being the command name
+     * @param design  design that is read and then rewritten in place
+     */
+    void execute(std::vector<std::string> args, RTLIL::Design *design) override
+    {
+        bool flag_arch_file = false;
+        bool flag_config_file = false;
+        bool flag_load_vtr_primitives = false;
+        bool flag_no_pass = false;
+        bool flag_visualize = false;
+        std::string arch_file_path;
+        std::string config_file_path;
+        std::string top_module_name;
+        std::string DEFAULT_OUTPUT(".");
+
+        // negative values mean "not given": the mixer's MULTIPLY option is then left untouched below
+        global_args.exact_mults = -1;
+        global_args.mults_ratio = -1.0;
+
+        log_header(design, "Starting parmys pass.\n");
+
+        size_t argidx;
+        for (argidx = 1; argidx < args.size(); argidx++) {
+            if (args[argidx] == "-a" && argidx + 1 < args.size()) {
+                arch_file_path = args[++argidx];
+                flag_arch_file = true;
+                continue;
+            }
+            if (args[argidx] == "-c" && argidx + 1 < args.size()) {
+                config_file_path = args[++argidx];
+                flag_config_file = true;
+                continue;
+            }
+            if (args[argidx] == "-top" && argidx + 1 < args.size()) {
+                top_module_name = args[++argidx];
+                continue;
+            }
+            if (args[argidx] == "-vtr_prim") {
+                flag_load_vtr_primitives = true;
+                continue;
+            }
+            if (args[argidx] == "-viz") {
+                flag_visualize = true;
+                continue;
+            }
+            if (args[argidx] == "-nopass") {
+                flag_no_pass = true;
+                continue;
+            }
+            if (args[argidx] == "-exact_mults" && argidx + 1 < args.size()) {
+                global_args.exact_mults = atoi(args[++argidx].c_str());
+                continue;
+            }
+            if (args[argidx] == "-mults_ratio" && argidx + 1 < args.size()) {
+                global_args.mults_ratio = atof(args[++argidx].c_str());
+                continue;
+            }
+        }
+        extra_args(args, argidx, design);
+
+        std::vector<t_physical_tile_type> physical_tile_types;
+        std::vector<t_logical_block_type> logical_block_types;
+
+        try {
+            /* Some initialization */
+            one_string = vtr::strdup(ONE_VCC_CNS);
+            zero_string = vtr::strdup(ZERO_GND_ZERO);
+            pad_string = vtr::strdup(ZERO_PAD_ZERO);
+        } catch (vtr::VtrError &vtr_error) {
+            log_error("Odin failed to initialize %s with exit code%d\n", vtr_error.what(), ERROR_INITIALIZATION);
+        }
+
+        mixer = new HardSoftLogicMixer();
+        set_default_config();
+
+        // a ratio within [0, 1] takes precedence over an exact multiplier count
+        if (global_args.mults_ratio >= 0.0 && global_args.mults_ratio <= 1.0) {
+            delete mixer->_opts[MULTIPLY];
+            mixer->_opts[MULTIPLY] = new MultsOpt(global_args.mults_ratio);
+        } else if (global_args.exact_mults >= 0) {
+            delete mixer->_opts[MULTIPLY];
+            mixer->_opts[MULTIPLY] = new MultsOpt(global_args.exact_mults);
+        }
+
+        configuration.coarsen = true;
+
+        /* read the configuration file; the config values were preset above in case they are not read in from the file */
+        if (flag_config_file) {
+            log("Reading Configuration file\n");
+            try {
+                read_config_file(config_file_path.c_str());
+            } catch (vtr::VtrError &vtr_error) {
+                log_error("Odin Failed Reading Configuration file %s with exit code%d\n", vtr_error.what(), ERROR_PARSE_CONFIG);
+            }
+        }
+
+        if (flag_arch_file) {
+            log("Architecture: %s\n", vtr::basename(arch_file_path).c_str());
+            log("Reading FPGA Architecture file\n");
+            try {
+                XmlReadArch(arch_file_path.c_str(), false, &Arch, physical_tile_types, logical_block_types);
+                set_physical_lut_size(logical_block_types);
+            } catch (vtr::VtrError &vtr_error) {
+                log_error("Odin Failed to load architecture file: %s with exit code%d at line: %ld\n", vtr_error.what(), ERROR_PARSE_ARCH,
+                          vtr_error.line());
+            }
+        }
+        log("Using Lut input width of: %d\n", physical_lut_size);
+
+        if (!flag_no_pass) {
+            if (flag_load_vtr_primitives) {
+                Pass::call(design, "read_verilog -nomem2reg +/parmys/vtr_primitives.v");
+                Pass::call(design, "setattr -mod -set keep_hierarchy 1 single_port_ram");
+                Pass::call(design, "setattr -mod -set keep_hierarchy 1 dual_port_ram");
+            }
+
+            // without -a there are no hard blocks to load; calling parmys_arch with an empty path would abort on the dangling "-a"
+            if (flag_arch_file) {
+                Pass::call(design, "parmys_arch -a " + arch_file_path);
+            }
+
+            if (top_module_name.empty()) {
+                Pass::call(design, "hierarchy -check -auto-top -purge_lib");
+            } else {
+                Pass::call(design, "hierarchy -check -top " + top_module_name);
+            }
+
+            Pass::call(design, "proc -norom");
+            Pass::call(design, "fsm");
+            Pass::call(design, "opt");
+            Pass::call(design, "wreduce");
+            Pass::call(design, "memory -norom");
+            Pass::call(design, "check");
+            Pass::call(design, "flatten");
+            Pass::call(design, "opt -full");
+        }
+
+        // top_module() yields nullptr when no top module can be determined (e.g. -nopass without a marked top); fail cleanly
+        RTLIL::Module *top_module = design->top_module();
+        if (top_module == nullptr) {
+            log_error("Top module not found in the design: parmys pass requires a top module!\n");
+        }
+
+        if (top_module->processes.size() != 0) {
+            log_error("Found unmapped processes in top module %s: unmapped processes are not supported in parmys pass!\n",
+                      log_id(top_module->name));
+        }
+
+        if (top_module->memories.size() != 0) {
+            log_error("Found unmapped memories in module %s: unmapped memories are not supported in parmys pass!\n",
+                      log_id(top_module->name));
+        }
+
+        design->sort();
+        log("--------------------------------------------------------------------\n");
+        log("Creating Odin-II Netlist from Design\n");
+
+        // remember name and port bits (ordered by port id) of every black-box module; the modules are recreated further below
+        std::vector<Bbox> black_boxes;
+        for (auto bb_module : design->modules()) {
+            if (bb_module->get_bool_attribute(ID::blackbox)) {
+                Bbox bb;
+                bb.name = str(bb_module->name);
+
+                std::map<int, RTLIL::Wire *> inputs, outputs;
+                for (auto wire : bb_module->wires()) {
+                    if (wire->port_input)
+                        inputs[wire->port_id] = wire;
+                    if (wire->port_output)
+                        outputs[wire->port_id] = wire;
+                }
+
+                for (auto &it : inputs) {
+                    RTLIL::Wire *wire = it.second;
+                    for (int i = 0; i < wire->width; i++)
+                        bb.inputs.push_back(str(RTLIL::SigSpec(wire, i)));
+                }
+
+                for (auto &it : outputs) {
+                    RTLIL::Wire *wire = it.second;
+                    for (int i = 0; i < wire->width; i++)
+                        bb.outputs.push_back(str(RTLIL::SigSpec(wire, i)));
+                }
+
+                black_boxes.push_back(bb);
+            }
+        }
+
+        netlist_t *transformed = to_netlist(top_module, design);
+        double synthesis_time = wall_time();
+        log("--------------------------------------------------------------------\n");
+        log("High-level Synthesis Begin\n");
+
+        /* Performing elaboration for input digital circuits */
+        try {
+            elaborate(transformed);
+            log("Successful Elaboration of the design by Odin-II\n");
+            if (flag_visualize) {
+                graphVizOutputNetlist(DEFAULT_OUTPUT, "netlist.elaborated.net", 111, transformed);
+                log("Successful visualization of the elaborated netlist\n");
+            }
+        } catch (vtr::VtrError &vtr_error) {
+            log_error("Odin-II Failed to parse Verilog / load BLIF file: %s with exit code:%d \n", vtr_error.what(), ERROR_ELABORATION);
+        }
+
+        /* Performing netlist optimizations */
+        try {
+            optimization(transformed);
+            log("Successful Optimization of netlist by Odin-II\n");
+            if (flag_visualize) {
+                graphVizOutputNetlist(DEFAULT_OUTPUT, "netlist.optimized.net", 222, transformed);
+                log("Successful visualization of the optimized netlist\n");
+            }
+        } catch (vtr::VtrError &vtr_error) {
+            log_error("Odin-II Failed to perform netlist optimization %s with exit code:%d \n", vtr_error.what(), ERROR_OPTIMIZATION);
+        }
+
+        /* Performing partial tech. map to the target device */
+        try {
+            techmap(transformed);
+            log("Successful Partial Technology Mapping by Odin-II\n");
+            if (flag_visualize) {
+                graphVizOutputNetlist(DEFAULT_OUTPUT, "netlist.mapped.net", 333, transformed);
+                log("Successful visualization of the mapped netlist\n");
+            }
+        } catch (vtr::VtrError &vtr_error) {
+            log_error("Odin-II Failed to perform partial mapping to target device %s with exit code:%d \n", vtr_error.what(), ERROR_TECHMAP);
+        }
+
+        synthesis_time = wall_time() - synthesis_time;
+        log("\nTotal Synthesis Time: ");
+        log_time(synthesis_time);
+        log("\n--------------------------------------------------------------------\n");
+        report(transformed);
+        log("\n--------------------------------------------------------------------\n");
+
+        log("Updating the Design\n");
+        Pass::call(design, "delete");
+
+        for (auto module : design->modules()) {
+            design->remove(module);
+        }
+
+        // recreate the black-box modules recorded before the design was emptied
+        for (auto bb_module : black_boxes) {
+            Module *module = new Module;
+            hashlib::dict<IdString, std::pair<int, bool>> wideports_cache;
+            module->name = RTLIL::escape_id(bb_module.name);
+            if (design->module(module->name)) {
+                log_error("Duplicate definition of module %s!\n", log_id(module->name));
+            }
+            design->add(module);
+
+            for (auto b_wire : bb_module.inputs) {
+                RTLIL::Wire *wire = to_wire(b_wire, module);
+                wire->port_input = true;
+                std::pair<RTLIL::IdString, int> wp = wideports_split(RTLIL::unescape_id(b_wire));
+                if (!wp.first.empty() && wp.second >= 0) {
+                    wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                    wideports_cache[wp.first].second = true;
+                }
+            }
+
+            for (auto b_wire : bb_module.outputs) {
+                RTLIL::Wire *wire = to_wire(RTLIL::unescape_id(b_wire), module);
+                wire->port_output = true;
+                std::pair<RTLIL::IdString, int> wp = wideports_split(RTLIL::unescape_id(b_wire));
+                if (!wp.first.empty() && wp.second >= 0) {
+                    wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                    wideports_cache[wp.first].second = false;
+                }
+            }
+
+            handle_wideports_cache(&wideports_cache, module);
+            module->fixup_ports();
+            wideports_cache.clear();
+            module->attributes[ID::blackbox] = RTLIL::Const(1);
+        }
+
+        update_design(design, transformed);
+
+        if (!flag_no_pass) {
+            if (top_module_name.empty()) {
+                Pass::call(design, "hierarchy -check -auto-top -purge_lib");
+            } else {
+                Pass::call(design, "hierarchy -check -top " + top_module_name);
+            }
+        }
+
+        log("--------------------------------------------------------------------\n");
+        free_netlist(transformed);
+        if (Arch.models) {
+            free_arch(&Arch);
+            Arch.models = nullptr;
+        }
+
+        free_type_descriptors(logical_block_types);
+        free_type_descriptors(physical_tile_types);
+        vtr::free(transformed);
+        if (one_string) {
+            vtr::free(one_string);
+        }
+        if (zero_string) {
+            vtr::free(zero_string);
+        }
+        if (pad_string) {
+            vtr::free(pad_string);
+        }
+
+        log("parmys pass finished.\n");
+    }
+} ParMYSPass;
+
+PRIVATE_NAMESPACE_END
\ No newline at end of file
diff --git a/parmys-plugin/parmys_arch.cc b/parmys-plugin/parmys_arch.cc
new file mode 100644
index 000000000..d76c39e5c
--- /dev/null
+++ b/parmys-plugin/parmys_arch.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "kernel/yosys.h"
+
+#include "arch_util.h"
+#include "echo_arch.h"
+#include "odin_types.h"
+#include "parmys_utils.h"
+#include "read_xml_arch_file.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+// Yosys pass `parmys_arch`: reads a VTR architecture XML file and adds each model it declares,
+// except the single/dual port RAM, "multiply" and "adder" models, to the design as a black box.
+struct ParmysArchPass : public Pass {
+    // Adds a black-box module named after `hb` to `design`. Every bit of every input and output
+    // port of the model gets its own wire `name[i]`; handle_wideports_cache() then (presumably)
+    // regroups those bits into wide ports. Aborts if a module of that name already exists.
+    static void add_hb_to_design(t_model *hb, Design *design)
+    {
+        dict<IdString, std::pair<int, bool>> wideports_cache;
+
+        Module *module = new Module;
+        module->name = RTLIL::escape_id(hb->name);
+        if (design->module(module->name)) {
+            log_error("Duplicate definition of module %s!\n", log_id(module->name));
+        }
+        design->add(module);
+
+        t_model_ports *input_port = hb->inputs;
+        while (input_port) {
+            for (int i = 0; i < input_port->size; i++) {
+                std::string w_name = stringf("%s[%d]", input_port->name, i);
+                RTLIL::Wire *wire = to_wire(w_name, module);
+                wire->port_input = true;
+                std::pair<RTLIL::IdString, int> wp = wideports_split(w_name);
+                if (!wp.first.empty() && wp.second >= 0) {
+                    wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                    wideports_cache[wp.first].second = true;
+                }
+            }
+            input_port = input_port->next;
+        }
+
+        t_model_ports *output_port = hb->outputs;
+        while (output_port) {
+            for (int i = 0; i < output_port->size; i++) {
+                std::string w_name = stringf("%s[%d]", output_port->name, i);
+                RTLIL::Wire *wire = to_wire(w_name, module);
+                wire->port_output = true;
+                std::pair<RTLIL::IdString, int> wp = wideports_split(w_name);
+                if (!wp.first.empty() && wp.second >= 0) {
+                    wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                    wideports_cache[wp.first].second = false;
+                }
+            }
+            output_port = output_port->next;
+        }
+
+        handle_wideports_cache(&wideports_cache, module);
+        module->fixup_ports();
+        wideports_cache.clear();
+
+        module->attributes[ID::blackbox] = RTLIL::Const(1);
+    }
+
+    ParmysArchPass() : Pass("parmys_arch", "loads available hard blocks within the architecture to Yosys Design") {}
+
+    void help() override
+    {
+        log("\n");
+        log("    -a ARCHITECTURE_FILE\n");
+        log("        VTR FPGA architecture description file (XML)\n");
+    }
+
+    // Loads the architecture file given with `-a`, passes it to EchoArch() with the output name
+    // `arch.info`, and adds its hard block models to `design` (see add_hb_to_design).
+    void execute(std::vector<std::string> args, RTLIL::Design *design) override
+    {
+        size_t argidx = 1;
+        std::string arch_file_path;
+        // bounds check first: args holds nothing but the command name when no option is given
+        if (argidx + 1 < args.size() && args[argidx] == "-a") {
+            arch_file_path = args[++argidx];
+            argidx++;
+        }
+        extra_args(args, argidx, design);
+
+        t_arch arch;
+        std::vector<t_physical_tile_type> physical_tile_types;
+        std::vector<t_logical_block_type> logical_block_types;
+
+        try {
+            XmlReadArch(arch_file_path.c_str(), false, &arch, physical_tile_types, logical_block_types);
+        } catch (vtr::VtrError &vtr_error) {
+            log_error("Odin Failed to load architecture file: %s with exit code %s at line: %ld\n", vtr_error.what(), "ERROR_PARSE_ARCH",
+                      vtr_error.line());
+        }
+
+        const char *arch_info_file = "arch.info";
+        EchoArch(arch_info_file, physical_tile_types, logical_block_types, &arch);
+
+        // strcmp() is non-zero on a mismatch, so only models matching none of the four excluded names are added
+        t_model *hb = arch.models;
+        while (hb) {
+            if (strcmp(hb->name, SINGLE_PORT_RAM_string) && strcmp(hb->name, DUAL_PORT_RAM_string) && strcmp(hb->name, "multiply") &&
+                strcmp(hb->name, "adder")) {
+                add_hb_to_design(hb, design);
+                log("Hard block added to the Design ---> `%s`\n", hb->name);
+            }
+            hb = hb->next;
+        }
+
+        // CLEAN UP
+        free_arch(&arch);
+        free_type_descriptors(physical_tile_types);
+        free_type_descriptors(logical_block_types);
+
+        log("parmys_arch pass finished.\n");
+    }
+} ParmysArchPass;
+
+PRIVATE_NAMESPACE_END
diff --git a/parmys-plugin/parmys_resolve.cc b/parmys-plugin/parmys_resolve.cc
new file mode 100644
index 000000000..51cb665cb
--- /dev/null
+++ b/parmys-plugin/parmys_resolve.cc
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_globals.h"
+#include "odin_types.h"
+
+#include <string.h>
+
+#include "netlist_utils.h"
+
+#include "adder.h"
+#include "block_memory.h"
+#include "memory.h"
+#include "multiplier.h"
+#include "parmys_resolve.h"
+#include "subtractor.h"
+
+#include "vtr_util.h"
+
+USING_YOSYS_NAMESPACE
+
+// Depth-first walk over the fanout of `node`; each node is resolved after its fanout.
+void dfs_resolve(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+// Dispatches one node to the resolver that matches its type.
+void resolve_node(nnode_t *node, short traverse_mark_number, netlist_t *netlist);
+
+// Handles ADD, MINUS and MULTIPLY nodes.
+static void resolve_arithmetic_nodes(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+// Handles the memory node types (SPRAM, DPRAM, YMEM, YMEM2, MEMORY).
+static void resolve_memory_nodes(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist);
+
+// Re-types the nodes that act as clocks to CLOCK_NODE.
+static void look_for_clocks(netlist_t *netlist);
+
+/**
+ * Entry point of the resolve phase.
+ *
+ * Returns immediately unless `configuration.coarsen` is set. Otherwise it initialises
+ * the block memory index, runs dfs_resolve() from every non-NULL top-level input and
+ * from the gnd/vcc/pad nodes, marks the clock nodes and finally clears
+ * `configuration.coarsen`.
+ *
+ * @param netlist netlist to resolve
+ */
+void resolve_top(netlist_t *netlist)
+{
+    if (!configuration.coarsen)
+        return;
+
+    init_block_memory_index();
+
+    for (int input_idx = 0; input_idx < netlist->num_top_input_nodes; ++input_idx) {
+        nnode_t *top_input = netlist->top_input_nodes[input_idx];
+        if (top_input == NULL)
+            continue;
+
+        dfs_resolve(top_input, RESOLVE_DFS_VALUE, netlist);
+    }
+
+    // The constant nodes are traversal roots as well.
+    dfs_resolve(netlist->gnd_node, RESOLVE_DFS_VALUE, netlist);
+    dfs_resolve(netlist->vcc_node, RESOLVE_DFS_VALUE, netlist);
+    dfs_resolve(netlist->pad_node, RESOLVE_DFS_VALUE, netlist);
+
+    look_for_clocks(netlist);
+
+    configuration.coarsen = false;
+}
+
+/**
+ * Depth-first traversal used by the resolve phase.
+ *
+ * A node whose `traverse_visited` already equals `traverse_mark_number` is skipped.
+ * Otherwise the node is marked, every node reachable through the fanout pins of its
+ * output nets is visited recursively, and only then the node itself is handed to
+ * resolve_node().
+ *
+ * @param node                 node to start from
+ * @param traverse_mark_number marker identifying this traversal
+ * @param netlist              netlist the node belongs to
+ */
+void dfs_resolve(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    if (node->traverse_visited == traverse_mark_number)
+        return;
+
+    node->traverse_visited = traverse_mark_number;
+
+    for (int out_idx = 0; out_idx < node->num_output_pins; ++out_idx) {
+        nnet_t *fanout_net = node->output_pins[out_idx]->net;
+        if (!fanout_net || !fanout_net->fanout_pins)
+            continue;
+
+        for (int pin_idx = 0; pin_idx < fanout_net->num_fanout_pins; ++pin_idx) {
+            npin_t *sink_pin = fanout_net->fanout_pins[pin_idx];
+            if (sink_pin && sink_pin->node)
+                dfs_resolve(sink_pin->node, traverse_mark_number, netlist);
+        }
+    }
+
+    // Resolve this node only after everything downstream of it has been visited.
+    resolve_node(node, traverse_mark_number, netlist);
+}
+
+/**
+ * Resolves a single node according to its type.
+ *
+ * Arithmetic nodes go to resolve_arithmetic_nodes(), memory nodes go to
+ * resolve_memory_nodes(), a fixed set of node types is left untouched, and every other
+ * type is reported through error_message().
+ *
+ * @param node            node to resolve
+ * @param traverse_number traversal marker forwarded to the per-type resolver
+ * @param netlist         netlist the node belongs to
+ */
+void resolve_node(nnode_t *node, short traverse_number, netlist_t *netlist)
+{
+    switch (node->type) {
+    case ADD:
+    case MINUS:
+    case MULTIPLY: {
+        resolve_arithmetic_nodes(node, traverse_number, netlist);
+        break;
+    }
+    case SPRAM:
+    case DPRAM:
+    case ROM:
+    case BRAM:
+    case YMEM:
+    case YMEM2:
+    case MEMORY: {
+        resolve_memory_nodes(node, traverse_number, netlist);
+        break;
+    }
+    case GND_NODE:
+    case VCC_NODE:
+    case PAD_NODE:
+    case INPUT_NODE:
+    case OUTPUT_NODE:
+    case HARD_IP:
+    case BUF_NODE:
+    case BITWISE_NOT:
+    case BITWISE_AND:
+    case BITWISE_OR:
+    case BITWISE_NAND:
+    case BITWISE_NOR:
+    case BITWISE_XNOR:
+    case BITWISE_XOR: {
+        /* some are already resolved for this phase */
+        break;
+    }
+    case SKIP:
+        break;
+    case ADDER_FUNC:
+    case CARRY_FUNC:
+    case CLOCK_NODE:
+    case GENERIC:
+    default:
+        // node->type is an enumerator, not a string: it must be printed with an integer
+        // conversion, handing it to %s is undefined behaviour.
+        error_message(RESOLVE, node->loc, "node (type %d: %s) should have been converted to softer version.", static_cast<int>(node->type),
+                      node->name);
+        break;
+    }
+}
+
+/**
+ * Resolves an ADD, MINUS or MULTIPLY node and records it in the matching global list
+ * (`add_list`, `sub_list` or `mult_list`). Any other node type is reported through
+ * error_message().
+ *
+ * @param node                 node to resolve; must already carry the traversal marker
+ * @param traverse_mark_number marker of the running traversal
+ * @param netlist              netlist the node belongs to
+ */
+static void resolve_arithmetic_nodes(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    // The type is sampled once, before any helper gets a chance to replace the node.
+    const auto node_type = node->type;
+
+    if (node_type == ADD) {
+        // With hard adders the helper may hand back a different node to record.
+        if (hard_adders)
+            node = check_missing_ports(node, traverse_mark_number, netlist);
+
+        add_list = insert_in_vptr_list(add_list, node);
+    } else if (node_type == MINUS) {
+        equalize_ports_size(node, traverse_mark_number, netlist);
+
+        sub_list = insert_in_vptr_list(sub_list, node);
+    } else if (node_type == MULTIPLY) {
+        if (hard_multipliers)
+            check_multiplier_port_size(node);
+        else
+            check_constant_multipication(node, traverse_mark_number, netlist);
+
+        mult_list = insert_in_vptr_list(mult_list, node);
+    } else {
+        error_message(RESOLVE, node->loc, "The node(%s) type is not among Odin's arithmetic types [ADD, MINUS and MULTIPLY]\n", node->name);
+    }
+}
+
+/**
+ * Resolves a memory node by forwarding it to the resolver of its concrete type.
+ *
+ * SPRAM, DPRAM, YMEM and YMEM2 have dedicated resolvers, MEMORY needs no work here and
+ * every other type is reported through error_message().
+ *
+ * @param node                 node to resolve; must already carry the traversal marker
+ * @param traverse_mark_number marker of the running traversal
+ * @param netlist              netlist the node belongs to
+ */
+static void resolve_memory_nodes(nnode_t *node, uintptr_t traverse_mark_number, netlist_t *netlist)
+{
+    oassert(node->traverse_visited == traverse_mark_number);
+
+    switch (node->type) {
+    case SPRAM: {
+        resolve_single_port_ram(node, traverse_mark_number, netlist);
+        break;
+    }
+    case DPRAM: {
+        resolve_dual_port_ram(node, traverse_mark_number, netlist);
+        break;
+    }
+    case YMEM: {
+        resolve_ymem_node(node, traverse_mark_number, netlist);
+        break;
+    }
+    case YMEM2: {
+        resolve_ymem2_node(node, traverse_mark_number, netlist);
+        break;
+    }
+    case MEMORY: {
+        break;
+    }
+    default: {
+        // node->type is an enumerator: print it as an integer, %s would be undefined
+        // behaviour.
+        error_message(RESOLVE, node->loc, "The node(%s) type (%d) is not among Odin's latch types [SPRAM, DPRAM, ROM and BRAM(RW)]\n", node->name,
+                      static_cast<int>(node->type));
+        break;
+    }
+    }
+}
+
+/**
+ * Marks the clock nodes of the netlist by setting their type to CLOCK_NODE.
+ *
+ * Two kinds of nodes are marked: top-level inputs named DEFAULT_CLOCK_NAME, and the
+ * node driving input pin 1 of every flip-flop node, reached by walking backwards
+ * through any BUF_NODE in between.
+ *
+ * @param netlist netlist to scan
+ */
+static void look_for_clocks(netlist_t *netlist)
+{
+    for (int i = 0; i < netlist->num_top_input_nodes; i++) {
+        nnode_t *input_node = netlist->top_input_nodes[i];
+        // The other walks over top_input_nodes skip NULL entries; do the same here
+        // instead of dereferencing them.
+        if (input_node == NULL)
+            continue;
+
+        if (!strcmp(input_node->name, DEFAULT_CLOCK_NAME))
+            input_node->type = CLOCK_NODE;
+    }
+
+    for (int i = 0; i < netlist->num_ff_nodes; i++) {
+        oassert(netlist->ff_nodes[i]->input_pins[1]->net->num_driver_pins == 1);
+        nnode_t *node = netlist->ff_nodes[i]->input_pins[1]->net->driver_pins[0]->node;
+
+        // Skip over buffers to reach the node that really drives the pin.
+        while (node->type == BUF_NODE)
+            node = node->input_pins[0]->net->driver_pins[0]->node;
+
+        node->type = CLOCK_NODE;
+    }
+}
diff --git a/parmys-plugin/parmys_resolve.h b/parmys-plugin/parmys_resolve.h
new file mode 100644
index 000000000..07ad9aa06
--- /dev/null
+++ b/parmys-plugin/parmys_resolve.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _PARMYS_RESOLVE_HPP_
+#define _PARMYS_RESOLVE_HPP_
+
+// Name that look_for_clocks() in parmys_resolve.cc compares top-level input node names
+// against; a matching input is re-typed as CLOCK_NODE.
+#define DEFAULT_CLOCK_NAME "GLOBAL_SIM_BASE_CLK"
+
+// Runs the resolve phase over `netlist`; does nothing unless configuration.coarsen is set.
+void resolve_top(netlist_t *netlist);
+
+#endif // _PARMYS_RESOLVE_HPP_
diff --git a/parmys-plugin/parmys_update.cc b/parmys-plugin/parmys_update.cc
new file mode 100644
index 000000000..ef55213c5
--- /dev/null
+++ b/parmys-plugin/parmys_update.cc
@@ -0,0 +1,551 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <string.h>
+
+#include "odin_globals.h"
+#include "odin_types.h"
+
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+#include "node_utils.h"
+
+#include "adder.h"
+#include "hard_block.h"
+#include "multiplier.h"
+
+#include "kernel/rtlil.h"
+#include "parmys_update.h"
+#include "parmys_utils.h"
+
+USING_YOSYS_NAMESPACE
+
+// Starts the netlist walk from the constant nodes and the top-level inputs.
+static void depth_first_traversal_to_design(short marker_value, Module *module, netlist_t *netlist, Design *design);
+// Visits `node` and, recursively, every node reachable through its fanout.
+static void depth_traverse_update_design(nnode_t *node, uintptr_t traverse_mark_number, Module *module, netlist_t *netlist, Design *design);
+// Emits the design cell(s) for one node, selected by the node type.
+static void cell_node(nnode_t *node, short /*traverse_number*/, Module *module, netlist_t *netlist, Design *design);
+
+/**
+ * Returns the module wire that corresponds to one driver of a net, creating the wire
+ * when it does not exist yet.
+ *
+ * A driver pin without a node yields the "$undef" wire and a warning. For drivers of
+ * type MULTIPLY, HARD_IP, MEMORY, ADD, MINUS or SKIP the wire is named after the driver
+ * pin when the pin has a name; in every other case it is named after the driver node.
+ *
+ * @param module     module that owns the wire
+ * @param node       node being driven; only used for the warning message
+ * @param net        net whose driver is looked up
+ * @param driver_idx index into the net's driver pins
+ */
+Wire *wire_net_driver(Module *module, nnode_t *node, nnet_t *net, long driver_idx)
+{
+    oassert(driver_idx < net->num_driver_pins);
+    npin_t *driver_pin = net->driver_pins[driver_idx];
+
+    if (driver_pin->node == NULL) {
+        // Add a warning for an undriven net.
+        warning_message(NETLIST, node->loc, "Net %s driving node %s is itself undriven.", net->name, node->name);
+        return to_wire("$undef", module);
+    }
+
+    bool named_after_pin = false;
+    switch (driver_pin->node->type) {
+    case MULTIPLY:
+    case HARD_IP:
+    case MEMORY:
+    case ADD:
+    case MINUS:
+    case SKIP:
+        named_after_pin = (driver_pin->name != NULL);
+        break;
+    default:
+        break;
+    }
+
+    std::string wire_name;
+    if (named_after_pin)
+        wire_name = driver_pin->name;
+    else
+        wire_name = driver_pin->node->name;
+
+    return to_wire(wire_name, module);
+}
+
+/**
+ * Returns the wire feeding input pin `pin_idx` of `node`.
+ *
+ * A net without any driver maps to the "$undef" wire; otherwise the net must have
+ * exactly one driver, whose wire is returned.
+ */
+Wire *wire_input_single_driver(Module *module, nnode_t *node, long pin_idx)
+{
+    oassert(pin_idx < node->num_input_pins);
+    nnet_t *input_net = node->input_pins[pin_idx]->net;
+
+    if (input_net->num_driver_pins == 0)
+        return to_wire("$undef", module);
+
+    oassert(input_net->num_driver_pins == 1);
+    return wire_net_driver(module, node, input_net, 0);
+}
+
+/**
+ * Returns the wire named "\<node name>" in `module`, adding it when it is missing.
+ */
+Wire *wire_output_pin(Module *module, nnode_t *node)
+{
+    const RTLIL::IdString output_id(stringf("\\%s", node->name));
+
+    if (RTLIL::Wire *existing = module->wire(output_id))
+        return existing;
+
+    return module->addWire(output_id);
+}
+
+/**
+ * Creates a new module in `design` from the contents of `netlist`.
+ *
+ * The module is named after the first whitespace-delimited token of
+ * netlist->identifier. The function adds the constant wires, declares the top-level
+ * ports, emits one or more cells per netlist node, connects the outputs, merges
+ * bit-indexed ports into wide ports, removes buffer LUTs and finally adds the
+ * multiplier/adder black boxes and the hard blocks.
+ *
+ * @param design  design that receives the new module
+ * @param netlist netlist to convert
+ */
+void update_design(Design *design, netlist_t *netlist)
+{
+    RTLIL::Module *module = nullptr;
+    // NOTE(review): err_reason is never used in this function.
+    std::string err_reason;
+    int blif_maxnum = 0;
+
+    // Maps a wide port name to (width, is_input); filled while the ports are declared.
+    hashlib::dict<RTLIL::IdString, std::pair<int, bool>> wideports_cache;
+
+    module = new RTLIL::Module;
+    // strtok() writes a terminator into netlist->identifier, i.e. the identifier is
+    // truncated in place at its first whitespace character.
+    module->name = RTLIL::escape_id(strtok(netlist->identifier, " \t\r\n"));
+
+    if (design->module(module->name)) {
+        log_error("Duplicate definition of module %s\n", log_id(module->name));
+    }
+    design->add(module);
+
+    // Constant wires: "$undef" is tied to x, "$true" to 1 and "$false" to 0.
+    RTLIL::SigSpec undef;
+    undef.append(to_wire("$undef", module));
+    module->connect(RTLIL::SigSig(undef, RTLIL::State::Sx));
+    RTLIL::SigSpec vcc;
+    vcc.append(to_wire("$true", module));
+    // vcc.append(module->wire(ID($true)));
+    module->connect(RTLIL::SigSig(vcc, RTLIL::State::S1));
+    RTLIL::SigSpec gnd;
+    gnd.append(to_wire("$false", module));
+    module->connect(RTLIL::SigSig(gnd, RTLIL::State::S0));
+
+    // Declare one input wire per top-level input node.
+    for (int i = 0; i < netlist->num_top_input_nodes; i++) {
+        nnode_t *top_input_node = netlist->top_input_nodes[i];
+        RTLIL::Wire *wire = to_wire(top_input_node->name, module);
+        wire->port_input = true;
+
+        // A name of the form "base[idx]" widens the cached port "base" to idx + 1 bits.
+        std::pair<RTLIL::IdString, int> wp = wideports_split(top_input_node->name);
+        if (!wp.first.empty() && wp.second >= 0) {
+            wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+            wideports_cache[wp.first].second = true;
+        }
+    }
+
+    // Declare one output wire per driven top-level output node.
+    for (int i = 0; i < netlist->num_top_output_nodes; i++) {
+        nnode_t *top_output_node = netlist->top_output_nodes[i];
+        if (!top_output_node->input_pins[0]->net->num_driver_pins) {
+            log_warning("This output is undriven (%s) and will be removed\n", top_output_node->name);
+        } else {
+            RTLIL::Wire *wire = to_wire(top_output_node->name, module);
+            wire->port_output = true;
+
+            std::pair<RTLIL::IdString, int> wp = wideports_split(top_output_node->name);
+            if (!wp.first.empty() && wp.second >= 0) {
+                wideports_cache[wp.first].first = std::max(wideports_cache[wp.first].first, wp.second + 1);
+                wideports_cache[wp.first].second = false;
+            }
+        }
+    }
+
+    // Emit the cells; 100 is the marker value written into every visited node.
+    depth_first_traversal_to_design(100, module, netlist, design);
+
+    /* connect all the outputs up to the last gate */
+    for (int i = 0; i < netlist->num_top_output_nodes; i++) {
+        nnode_t *node = netlist->top_output_nodes[i];
+
+        if (node->input_pins[0]->net->num_fanout_pins > 0) {
+            nnet_t *net = node->input_pins[0]->net;
+            for (int j = 0; j < net->num_driver_pins; j++) {
+                Wire *driver_wire = wire_net_driver(module, node, net, j);
+                Wire *out_wire = to_wire(node->name, module);
+
+                RTLIL::SigSpec input_sig, output_sig;
+                input_sig.append(driver_wire);
+                output_sig.append(out_wire);
+
+                module->connect(output_sig, input_sig);
+            }
+        }
+    }
+
+    // Replace the per-bit port wires collected above by wide ports.
+    handle_wideports_cache(&wideports_cache, module);
+
+    module->fixup_ports();
+    wideports_cache.clear();
+
+    // run_clean is always true, so the clean-up below is unconditional.
+    bool run_clean = true;
+    if (run_clean) {
+        // LUT contents {0, 1}: the output equals the input, i.e. the cell is a buffer.
+        Const buffer_lut(std::vector<RTLIL::State>({State::S0, State::S1}));
+        std::vector<Cell *> remove_cells;
+
+        // Bypass every buffer LUT with a direct connection; the cells are removed in a
+        // second step, after the iteration over module->cells() has finished.
+        for (auto cell : module->cells())
+            if (cell->type == ID($lut) && cell->getParam(ID::LUT) == buffer_lut) {
+                module->connect(cell->getPort(ID::Y), cell->getPort(ID::A));
+                remove_cells.push_back(cell);
+            }
+
+        for (auto cell : remove_cells)
+            module->remove(cell);
+
+        Wire *true_wire = module->wire(ID($true));
+        Wire *false_wire = module->wire(ID($false));
+        Wire *undef_wire = module->wire(ID($undef));
+
+        // Rename the constant wires by appending "$<n>", with n counted by blif_maxnum.
+        if (true_wire != nullptr)
+            module->rename(true_wire, stringf("$true$%d", ++blif_maxnum));
+
+        if (false_wire != nullptr)
+            module->rename(false_wire, stringf("$false$%d", ++blif_maxnum));
+
+        if (undef_wire != nullptr)
+            module->rename(undef_wire, stringf("$undef$%d", ++blif_maxnum));
+
+        blif_maxnum = 0;
+    }
+
+    add_the_blackbox_for_mults_yosys(design);
+    add_the_blackbox_for_adds_yosys(design);
+
+    output_hard_blocks_yosys(design);
+
+    module = nullptr;
+}
+
+/**
+ * Starts the netlist-to-design traversal.
+ *
+ * When `coarsen_cleanup` is not set, the gnd, vcc and pad nodes are first renamed to
+ * "$false", "$true" and "$undef". The traversal then runs from these three nodes and
+ * from every non-NULL top-level input node.
+ *
+ * @param marker_value value written into each visited node to mark it
+ * @param module       module that receives the cells
+ * @param netlist      netlist to traverse
+ * @param design       design that owns `module`
+ */
+void depth_first_traversal_to_design(short marker_value, Module *module, netlist_t *netlist, Design *design)
+{
+    if (!coarsen_cleanup) {
+        netlist->gnd_node->name = vtr::strdup("$false");
+        netlist->vcc_node->name = vtr::strdup("$true");
+        netlist->pad_node->name = vtr::strdup("$undef");
+    }
+
+    // Constant nodes first, in the order gnd, vcc, pad.
+    depth_traverse_update_design(netlist->gnd_node, marker_value, module, netlist, design);
+    depth_traverse_update_design(netlist->vcc_node, marker_value, module, netlist, design);
+    depth_traverse_update_design(netlist->pad_node, marker_value, module, netlist, design);
+
+    for (int input_idx = 0; input_idx < netlist->num_top_input_nodes; ++input_idx) {
+        nnode_t *top_input = netlist->top_input_nodes[input_idx];
+        if (top_input == NULL)
+            continue;
+
+        depth_traverse_update_design(top_input, marker_value, module, netlist, design);
+    }
+}
+
+/**
+ * Recursive step of the netlist-to-design traversal.
+ *
+ * An unvisited node is first turned into cells through cell_node(), then marked, and
+ * finally every node attached to a fanout pin of one of its output nets is visited.
+ *
+ * @param node                 node to visit
+ * @param traverse_mark_number marker identifying this traversal
+ * @param module               module that receives the cells
+ * @param netlist              netlist the node belongs to
+ * @param design               design that owns `module`
+ */
+void depth_traverse_update_design(nnode_t *node, uintptr_t traverse_mark_number, Module *module, netlist_t *netlist, Design *design)
+{
+    if (node->traverse_visited == traverse_mark_number)
+        return;
+
+    cell_node(node, traverse_mark_number, module, netlist, design);
+
+    node->traverse_visited = traverse_mark_number;
+
+    for (int out_idx = 0; out_idx < node->num_output_pins; ++out_idx) {
+        nnet_t *fanout_net = node->output_pins[out_idx]->net;
+        if (fanout_net != NULL) {
+            for (int pin_idx = 0; pin_idx < fanout_net->num_fanout_pins; ++pin_idx) {
+                if (fanout_net->fanout_pins[pin_idx] != NULL) {
+                    nnode_t *sink_node = fanout_net->fanout_pins[pin_idx]->node;
+                    if (sink_node != NULL)
+                        depth_traverse_update_design(sink_node, traverse_mark_number, module, netlist, design);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Emits the design representation of one netlist node, selected by the node type.
+ *
+ * Types with a mapping are forwarded to the matching define_*_yosys() function or to
+ * cell_hard_block(); port and constant nodes produce nothing; every remaining type is
+ * rejected with log_error().
+ *
+ * @param node    node to convert
+ * @param module  module that receives the cells
+ * @param netlist netlist the node belongs to
+ * @param design  design that owns `module`
+ */
+void cell_node(nnode_t *node, short /*traverse_number*/, Module *module, netlist_t *netlist, Design *design)
+{
+    switch (node->type) {
+    // The following four types have no mapping here and are reported by name.
+    case GT:
+        log_error("GT\n");
+        break;
+    case LT:
+        log_error("LT\n");
+        break;
+    case BITWISE_NOT:
+        log_error("BITWISE_NOT\n");
+        break;
+    case BUF_NODE:
+        log_error("BUF_NODE\n");
+        break;
+    // Single-output logic functions.
+    case LOGICAL_OR:
+    case LOGICAL_AND:
+    case LOGICAL_NOT:
+    case LOGICAL_NOR:
+    case LOGICAL_XOR:
+    case ADDER_FUNC:
+    case CARRY_FUNC:
+    case LOGICAL_XNOR:
+        define_logical_function_yosys(node, module);
+        break;
+    // Rejected here even though define_logical_function_yosys() has cases for them.
+    case LOGICAL_NAND:
+    case LOGICAL_EQUAL:
+    case NOT_EQUAL:
+        log_error("LOGICAL_\n");
+        break;
+    case MUX_2:
+        define_MUX_function_yosys(node, module);
+        break;
+
+    case SMUX_2:
+        define_SMUX_function_yosys(node, module);
+        break;
+
+    case FF_NODE:
+        define_FF_yosys(node, module);
+        break;
+
+    case MULTIPLY:
+        oassert(hard_multipliers); /* should be soft logic! */
+        define_mult_function_yosys(node, module, design);
+        break;
+
+    case ADD:
+        oassert(hard_adders); /* should be soft logic! */
+        define_add_function_yosys(node, module, design);
+        break;
+
+    // MINUS goes through the same function as ADD.
+    case MINUS:
+        oassert(hard_adders); /* should be soft logic! */
+        define_add_function_yosys(node, module, design);
+        break;
+
+    case MEMORY:
+    case HARD_IP:
+        cell_hard_block(node, module, netlist, design);
+        break;
+    case CLOCK_NODE:
+        log_error("CLOCK\n");
+        break;
+    // Port and constant nodes need no cell.
+    case INPUT_NODE:
+    case OUTPUT_NODE:
+    case PAD_NODE:
+    case GND_NODE:
+    case VCC_NODE:
+        break;
+    case SKIP:
+        cell_hard_block(node, module, netlist, design);
+        break;
+    // Everything below is rejected with a generic message.
+    case BITWISE_AND:
+    case BITWISE_NAND:
+    case BITWISE_NOR:
+    case BITWISE_XNOR:
+    case BITWISE_XOR:
+    case BITWISE_OR:
+    case MULTI_PORT_MUX:
+    case SL:
+    case ASL:
+    case SR:
+    case ASR:
+    case CASE_EQUAL:
+    case CASE_NOT_EQUAL:
+    case DIVIDE:
+    case MODULO:
+    case GTE:
+    case LTE:
+    default:
+        log_error("node should have been converted to softer version.");
+        break;
+    }
+}
+
+/**
+ * Adds the storage cell for a flip-flop node to `module`.
+ *
+ * Input pin 0 is the data input, input pin 1 the clock and the node name the output
+ * wire. The edge string returned by edge_type_blif_str() selects the cell:
+ * "re" and "fe" add a Dff (the latter with the polarity argument set to false),
+ * "ah" and "al" add a Dlatch (the latter with the polarity argument set to false),
+ * and anything else, or a missing clock wire or edge string, adds a plain Ff.
+ * An initial value of _0 or _1 is stored in the `init` attribute of the output wire.
+ *
+ * @param node   flip-flop node to convert
+ * @param module module that receives the cell
+ */
+void define_FF_yosys(nnode_t *node, Module *module)
+{
+    Wire *d = wire_input_single_driver(module, node, 0);
+    Wire *q = wire_output_pin(module, node);
+    // The edge string is only read, so it is used directly instead of being duplicated
+    // into a heap copy that was never released.
+    const char *edge = edge_type_blif_str(node->attributes->clk_edge_type, node->loc);
+    Wire *clock = wire_input_single_driver(module, node, 1);
+
+    if (node->initial_value == init_value_e::_0 || node->initial_value == init_value_e::_1)
+        q->attributes[ID::init] = Const(node->initial_value, 1);
+
+    if (clock != nullptr && edge != nullptr) {
+        if (!strcmp(edge, "re")) {
+            module->addDff(NEW_ID, clock, d, q);
+            return;
+        }
+        if (!strcmp(edge, "fe")) {
+            module->addDff(NEW_ID, clock, d, q, false);
+            return;
+        }
+        if (!strcmp(edge, "ah")) {
+            module->addDlatch(NEW_ID, clock, d, q);
+            return;
+        }
+        if (!strcmp(edge, "al")) {
+            module->addDlatch(NEW_ID, clock, d, q, false);
+            return;
+        }
+    }
+
+    // No usable clock or an unrecognised edge type.
+    module->addFf(NEW_ID, d, q);
+}
+
+/**
+ * Adds the cells for a MUX_2 node: an `$and` of the node's two input ports followed by
+ * a `$reduce_or` of the `$and` result onto the output wire named after the node.
+ *
+ * The node must have one output pin and two input ports of equal size.
+ *
+ * @param node   multiplexer node to convert
+ * @param module module that receives the cells
+ */
+void define_MUX_function_yosys(nnode_t *node, Module *module)
+{
+    oassert(node->num_output_pins == 1);
+    oassert(node->num_input_port_sizes == 2);
+    oassert(node->input_port_sizes[0] == node->input_port_sizes[1]);
+
+    RTLIL::SigSpec port_a_sig, port_b_sig, and_result_sig, mux_out_sig;
+
+    // First input port: pins [0, size of port 0).
+    for (int pin = 0; pin < node->input_port_sizes[0]; pin++)
+        port_a_sig.append(wire_net_driver(module, node, node->input_pins[pin]->net, 0));
+
+    // Second input port: all remaining input pins.
+    for (int pin = node->input_port_sizes[0]; pin < node->num_input_pins; pin++)
+        port_b_sig.append(wire_net_driver(module, node, node->input_pins[pin]->net, 0));
+
+    // Intermediate signal between the two cells, one entry per bit of port 0.
+    for (int bit = 0; bit < node->input_port_sizes[0]; bit++) {
+        std::string and_wire_name = op_node_name(BUF_NODE, node->name);
+        and_result_sig.append(to_wire(and_wire_name, module));
+    }
+
+    mux_out_sig.append(to_wire(node->name, module));
+
+    RTLIL::Cell *and_cell = module->addCell(NEW_ID, ID($and));
+    and_cell->setPort(ID::A, port_a_sig);
+    and_cell->parameters[ID::A_WIDTH] = RTLIL::Const(int(node->input_port_sizes[0]));
+    and_cell->parameters[ID::A_SIGNED] = RTLIL::Const(false);
+    and_cell->setPort(ID::B, port_b_sig);
+    and_cell->parameters[ID::B_WIDTH] = RTLIL::Const(int(node->input_port_sizes[1]));
+    and_cell->parameters[ID::B_SIGNED] = RTLIL::Const(false);
+    and_cell->setPort(ID::Y, and_result_sig);
+    and_cell->parameters[ID::Y_WIDTH] = RTLIL::Const(int(node->input_port_sizes[0]));
+
+    RTLIL::Cell *or_cell = module->addCell(NEW_ID, ID($reduce_or));
+    or_cell->setPort(ID::A, and_result_sig);
+    or_cell->parameters[ID::A_WIDTH] = RTLIL::Const(int(node->input_port_sizes[0]));
+    or_cell->parameters[ID::A_SIGNED] = RTLIL::Const(false);
+    or_cell->setPort(ID::Y, mux_out_sig);
+    or_cell->parameters[ID::Y_WIDTH] = RTLIL::Const(int(node->num_output_pins));
+}
+
+/**
+ * Adds the cell implementing a single-output logic node to `module`.
+ *
+ * All input pins of the node form port A of the cell and the wire named after the node
+ * forms port Y. The node type selects the cell type:
+ *   LOGICAL_AND               -> $reduce_and
+ *   LOGICAL_OR                -> $reduce_or
+ *   LOGICAL_NOT, LOGICAL_NOR  -> $logic_not
+ *   LOGICAL_XOR, ADDER_FUNC   -> $reduce_xor
+ *   LOGICAL_XNOR, NOT_EQUAL   -> $reduce_xnor
+ *   CARRY_FUNC                -> $lut whose output is 1 when at least two inputs are 1
+ * LOGICAL_NAND and LOGICAL_EQUAL have no mapping and are rejected with log_error()
+ * instead of producing a cell without a type.
+ *
+ * @param node   logic node to convert; must have exactly one output pin
+ * @param module module that receives the cell
+ */
+void define_logical_function_yosys(nnode_t *node, Module *module)
+{
+    RTLIL::SigSpec input_sig, output_sig;
+
+    for (int i = 0; i < node->num_input_pins; i++) {
+        nnet_t *input_net = node->input_pins[i]->net;
+        Wire *driver_wire = wire_net_driver(module, node, input_net, 0);
+
+        input_sig.append(driver_wire);
+    }
+
+    RTLIL::Wire *out_wire = to_wire(node->name, module);
+    output_sig.append(out_wire);
+
+    oassert(node->num_output_pins == 1);
+
+    IdString celltype;
+
+    /* select the cell type that implements this gate */
+    switch (node->type) {
+    case LOGICAL_AND: {
+        celltype = ID($reduce_and);
+        break;
+    }
+    case LOGICAL_OR: {
+        celltype = ID($reduce_or);
+        break;
+    }
+    case LOGICAL_NAND:
+    case LOGICAL_EQUAL: {
+        // No cell type is assigned for these; adding a cell with an empty type would
+        // leave an invalid cell in the module.
+        log_error("Logical node (%s) has no corresponding cell type.\n", node->name);
+        break;
+    }
+    case LOGICAL_NOT:
+    case LOGICAL_NOR: {
+        celltype = ID($logic_not);
+        break;
+    }
+    case ADDER_FUNC:
+        oassert(node->num_input_pins == 3);
+        celltype = ID($reduce_xor);
+        break;
+    case CARRY_FUNC:
+        oassert(node->num_input_pins == 3);
+        celltype = ID($lut);
+        break;
+    case LOGICAL_XOR: {
+        oassert(node->num_input_pins <= 3);
+        celltype = ID($reduce_xor);
+        break;
+    }
+    case NOT_EQUAL:
+    case LOGICAL_XNOR: {
+        oassert(node->num_input_pins <= 3);
+        celltype = ID($reduce_xnor);
+        break;
+    }
+    default:
+        oassert(false);
+        break;
+    }
+
+    RTLIL::Cell *cell = module->addCell(NEW_ID, celltype);
+
+    cell->setPort(ID::A, input_sig);
+    cell->setPort(ID::Y, output_sig);
+
+    if (node->type == CARRY_FUNC) {
+        cell->parameters[ID::WIDTH] = RTLIL::Const(input_sig.size());
+        cell->parameters[ID::LUT] = RTLIL::Const(RTLIL::State::Sx, 1 << input_sig.size());
+        RTLIL::Const *lutptr = NULL;
+        lutptr = &cell->parameters.at(ID::LUT);
+        // Truth table entries 3, 5, 6 and 7 are the input patterns with two or more
+        // bits set; they produce 1, every other pattern produces 0.
+        for (int i = 0; i < (1 << node->num_input_pins); i++) {
+            if (i == 3 || i == 5 || i == 6 || i == 7) //"011 1\n101 1\n110 1\n111 1\n"
+                lutptr->bits.at(i) = RTLIL::State::S1;
+            else
+                lutptr->bits.at(i) = RTLIL::State::S0;
+        }
+    } else {
+        cell->parameters[ID::A_WIDTH] = RTLIL::Const(int(node->num_input_pins));
+        cell->parameters[ID::Y_WIDTH] = RTLIL::Const(int(node->num_output_pins));
+        cell->parameters[ID::A_SIGNED] = RTLIL::Const(false);
+    }
+}
+
+/**
+ * Adds a 1-bit `$mux` cell for an SMUX_2 node.
+ *
+ * The node must have three input pins, used in the order S, A, B, and one output pin;
+ * the output wire is named after the node.
+ *
+ * @param node   multiplexer node to convert
+ * @param module module that receives the cell
+ */
+void define_SMUX_function_yosys(nnode_t *node, Module *module)
+{
+    oassert(node->num_input_pins == 3); // s a b
+
+    // Builds the one-bit signal driving the given input pin.
+    auto driver_signal = [&](int pin_idx) {
+        RTLIL::SigSpec sig;
+        sig.append(wire_net_driver(module, node, node->input_pins[pin_idx]->net, 0));
+        return sig;
+    };
+
+    RTLIL::SigSpec select_sig = driver_signal(0);
+    RTLIL::SigSpec data_a_sig = driver_signal(1);
+    RTLIL::SigSpec data_b_sig = driver_signal(2);
+
+    oassert(node->num_output_pins == 1); // y
+    RTLIL::SigSpec result_sig;
+    result_sig.append(to_wire(node->name, module));
+
+    RTLIL::Cell *mux_cell = module->addCell(NEW_ID, ID($mux));
+    mux_cell->parameters[ID::WIDTH] = RTLIL::Const(int(1));
+    mux_cell->setPort(ID::S, select_sig);
+    mux_cell->setPort(ID::A, data_a_sig);
+    mux_cell->setPort(ID::B, data_b_sig);
+    mux_cell->setPort(ID::Y, result_sig);
+}
diff --git a/parmys-plugin/parmys_update.h b/parmys-plugin/parmys_update.h
new file mode 100644
index 000000000..bfdcd95cc
--- /dev/null
+++ b/parmys-plugin/parmys_update.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _PARMYS_UPDATE_HPP_
+#define _PARMYS_UPDATE_HPP_
+
+#include "odin_types.h"
+
+USING_YOSYS_NAMESPACE
+
+// Same name and value as the definition in parmys_resolve.h.
+#define DEFAULT_CLOCK_NAME "GLOBAL_SIM_BASE_CLK"
+
+// Adds the cell for a single-output logic node (AND/OR/NOT/XOR/..., adder and carry functions).
+void define_logical_function_yosys(nnode_t *node, Module *module);
+// Creates a new module in `design` from the contents of `netlist`.
+void update_design(Design *design, netlist_t *netlist);
+// Adds the $and + $reduce_or cell pair for a MUX_2 node.
+void define_MUX_function_yosys(nnode_t *node, Module *module);
+// Adds a 1-bit $mux cell for an SMUX_2 node.
+void define_SMUX_function_yosys(nnode_t *node, Module *module);
+// Adds the Dff, Dlatch or Ff cell for a flip-flop node.
+void define_FF_yosys(nnode_t *node, Module *module);
+
+#endif //_PARMYS_UPDATE_HPP_
\ No newline at end of file
diff --git a/parmys-plugin/parmys_utils.cc b/parmys-plugin/parmys_utils.cc
new file mode 100644
index 000000000..a23a2848c
--- /dev/null
+++ b/parmys-plugin/parmys_utils.cc
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "parmys_utils.h"
+
+USING_YOSYS_NAMESPACE
+
+/**
+ * Returns the wire of `module` whose name is the escaped form of `wire_name`, adding
+ * the wire when the module does not have it yet.
+ */
+Wire *to_wire(std::string wire_name, Module *module)
+{
+    const IdString escaped_name = RTLIL::escape_id(wire_name);
+
+    if (Wire *existing = module->wire(escaped_name))
+        return existing;
+
+    return module->addWire(escaped_name);
+}
+
+/**
+ * Splits a bit-indexed name of the form "base[index]" into its base and its index.
+ *
+ * @param name name to analyse
+ * @return ("\" + base, index) when `name` ends in a bracketed decimal index, otherwise
+ *         an empty IdString paired with 0
+ */
+std::pair<RTLIL::IdString, int> wideports_split(std::string name)
+{
+    // Position of the '[' that opens the trailing index, or -1 while none is valid.
+    int pos = -1;
+
+    if (name.empty() || name.back() != ']')
+        goto failed;
+
+    // Scan everything except the closing ']'. Each '[' starts a new candidate; any
+    // character that cannot be part of a decimal index invalidates the candidate.
+    for (int i = 0; i + 1 < GetSize(name); i++) {
+        if (name[i] == '[')
+            pos = i;
+        // anything other than a digit or '-' cannot belong to an index
+        else if (name[i] != '-' && (name[i] < '0' || name[i] > '9'))
+            pos = -1;
+        // '-' is only accepted directly after '[' and not directly before ']'
+        else if (name[i] == '-' && ((i != pos + 1) || name[i + 1] == ']'))
+            pos = -1;
+        // a '0' directly after a leading '-' is rejected
+        else if (i == pos + 2 && name[i] == '0' && name[i - 1] == '-')
+            pos = -1;
+        // a leading '0' is only accepted when it is the whole index
+        else if (i == pos + 1 && name[i] == '0' && name[i + 1] != ']')
+            pos = -1;
+    }
+
+    if (pos >= 0)
+        return std::pair<RTLIL::IdString, int>("\\" + name.substr(0, pos), atoi(name.c_str() + pos + 1));
+
+failed:
+    return std::pair<RTLIL::IdString, int>(RTLIL::IdString(), 0);
+}
+
+/**
+ * Returns a printable name for a signal bit.
+ *
+ * Constant bits map to "$false" (0), "$true" (1) or "$undef" (anything else). For a
+ * wire bit the result is the unescaped wire name with every '#', '=', '<' and '>'
+ * replaced by '?', followed by "[index]" when the wire is wider than one bit.
+ */
+const std::string str(RTLIL::SigBit sig)
+{
+    if (sig.wire == NULL)
+        return sig == RTLIL::State::S0 ? "$false" : (sig == RTLIL::State::S1 ? "$true" : "$undef");
+
+    std::string name = RTLIL::unescape_id(sig.wire->name);
+    for (char &ch : name) {
+        switch (ch) {
+        case '#':
+        case '=':
+        case '<':
+        case '>':
+            ch = '?';
+            break;
+        default:
+            break;
+        }
+    }
+
+    if (sig.wire->width == 1)
+        return name;
+
+    // The index counts down from the top of the range for `upto` wires, up otherwise.
+    const int bit_index = sig.wire->upto ? sig.wire->start_offset + sig.wire->width - sig.offset - 1 : sig.wire->start_offset + sig.offset;
+    name += stringf("[%d]", bit_index);
+
+    return name;
+}
+
+/**
+ * Returns the unescaped form of `id` with every '#', '=', '<' and '>' replaced by '?'.
+ */
+const std::string str(RTLIL::IdString id)
+{
+    std::string name = RTLIL::unescape_id(id);
+
+    for (char &ch : name) {
+        const bool needs_replacement = (ch == '#' || ch == '=' || ch == '<' || ch == '>');
+        if (needs_replacement)
+            ch = '?';
+    }
+
+    return name;
+}
+
+/**
+ * Connects the wide ports of `cell` from the per-bit signals collected in the cache.
+ *
+ * For every cached port the width defaults to the highest cached bit index plus one.
+ * When the design holds a module for the cell type and that module declares the port
+ * as an input or output, the width, start offset and direction of that declaration are
+ * used instead. Bits missing from the cache are connected to freshly added wires.
+ *
+ * @param cell_wideports_cache port name -> (bit index -> signal bit)
+ * @param design               design searched for the module of the cell type
+ * @param module               module that receives the filler wires
+ * @param cell                 cell whose ports are set
+ */
+void handle_cell_wideports_cache(hashlib::dict<RTLIL::IdString, hashlib::dict<int, SigBit>> *cell_wideports_cache, Design *design, Module *module,
+                                 Cell *cell)
+{
+    RTLIL::Module *cell_definition = design->module(cell->type);
+
+    for (auto &port_entry : *cell_wideports_cache) {
+        auto &bits_by_index = port_entry.second;
+
+        int port_width = 0;
+        int port_offset = 0;
+        bool port_upto = false;
+        for (auto &indexed_bit : bits_by_index)
+            port_width = std::max(port_width, indexed_bit.first + 1);
+
+        Wire *declared_port = cell_definition ? cell_definition->wire(port_entry.first) : nullptr;
+        if (declared_port && (declared_port->port_input || declared_port->port_output)) {
+            port_offset = declared_port->start_offset;
+            port_upto = declared_port->upto;
+            port_width = declared_port->width;
+        }
+
+        SigSpec port_sig;
+        for (int bit = 0; bit < port_width; bit++) {
+            const int cache_idx = port_offset + (port_upto ? port_width - 1 - bit : bit);
+            if (bits_by_index.count(cache_idx))
+                port_sig.append(bits_by_index.at(cache_idx));
+            else
+                port_sig.append(module->addWire(NEW_ID));
+        }
+
+        cell->setPort(port_entry.first, port_sig);
+    }
+}
+
+/**
+ * Replaces bit-indexed port wires by wide ports.
+ *
+ * For every cache entry a wire of the cached width is added and flagged as input or
+ * output. Each existing wire named "<port>[i]" loses its port flags and is connected to
+ * bit i of the wide wire; the wide wire drives it for inputs and is driven by it for
+ * outputs.
+ *
+ * @param wideports_cache port name -> (width, is_input)
+ * @param module          module that owns the ports
+ */
+void handle_wideports_cache(hashlib::dict<RTLIL::IdString, std::pair<int, bool>> *wideports_cache, Module *module)
+{
+    for (auto &entry : *wideports_cache) {
+        auto port_name = entry.first;
+        const int port_width = entry.second.first;
+        const bool is_input = entry.second.second;
+
+        RTLIL::Wire *wide_wire = module->addWire(port_name, port_width);
+        wide_wire->port_input = is_input;
+        wide_wire->port_output = !is_input;
+
+        for (int bit = 0; bit < port_width; bit++) {
+            RTLIL::IdString bit_name = port_name.str() + stringf("[%d]", bit);
+            RTLIL::Wire *bit_wire = module->wire(bit_name);
+            if (!bit_wire)
+                continue;
+
+            bit_wire->port_input = false;
+            bit_wire->port_output = false;
+
+            if (is_input)
+                module->connect(bit_wire, SigSpec(wide_wire, bit));
+            else
+                module->connect(SigSpec(wide_wire, bit), bit_wire);
+        }
+    }
+}
diff --git a/parmys-plugin/parmys_utils.h b/parmys-plugin/parmys_utils.h
new file mode 100644
index 000000000..c40e309c4
--- /dev/null
+++ b/parmys-plugin/parmys_utils.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _PARMYS_UTILS_HPP_
+#define _PARMYS_UTILS_HPP_
+
+#include "odin_types.h"
+// NOTE(review): the macro below presumably opens the Yosys namespace for every file that includes this header - confirm.
+USING_YOSYS_NAMESPACE
+// Helper declarations for the parmys plugin; to_wire, wideports_split and str are defined elsewhere, see their bodies for exact semantics.
+Wire *to_wire(std::string wire_name, Module *module);
+std::pair<RTLIL::IdString, int> wideports_split(std::string name);
+const std::string str(RTLIL::SigBit sig);
+const std::string str(RTLIL::IdString id);
+void handle_cell_wideports_cache(hashlib::dict<RTLIL::IdString, hashlib::dict<int, SigBit>> *cell_wideports_cache, Design *design, Module *module,
+                                 Cell *cell); // per cached port: builds a SigSpec (fresh wires for missing bits) and assigns it with cell->setPort
+void handle_wideports_cache(hashlib::dict<RTLIL::IdString, std::pair<int, bool>> *wideports_cache, Module *module); // per entry: adds a wide port wire and connects existing "<name>[i]" wires to its bits
+
+#endif //_PARMYS_UTILS_HPP_
\ No newline at end of file
diff --git a/parmys-plugin/techlibs/adff2dff.v b/parmys-plugin/techlibs/adff2dff.v
new file mode 100644
index 000000000..52b6b9fcd
--- /dev/null
+++ b/parmys-plugin/techlibs/adff2dff.v
@@ -0,0 +1,48 @@
+// yosys -- Yosys Open SYnthesis Suite
+//
+// Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// SPDX-License-Identifier: ISC
+
+(* techmap_celltype = "$adff" *)
+module adff2dff (CLK, ARST, D, Q); // techmap rule for $adff: the reset is only applied on the selected CLK edge
+	parameter WIDTH = 1;
+	parameter CLK_POLARITY = 1;  // non-zero: rising edge, zero: falling edge
+	parameter ARST_POLARITY = 1; // level of ARST that selects ARST_VALUE
+	parameter ARST_VALUE = 0;
+
+	input CLK, ARST;
+	(* force_downto *)
+	input [WIDTH-1:0] D;
+	(* force_downto *)
+	output reg [WIDTH-1:0] Q;
+	(* force_downto *)
+	reg [WIDTH-1:0] NEXT_Q;
+
+	wire [1023:0] _TECHMAP_DO_ = "proc;;"; // techmap executes this command string ("proc") on the module
+	// Combinational next-state mux, so blocking assignments are used.
+	always @*
+		if (ARST == ARST_POLARITY)
+			NEXT_Q = ARST_VALUE;
+		else
+			NEXT_Q = D;
+	// Clocked register; the edge is chosen at elaboration time from CLK_POLARITY.
+	if (CLK_POLARITY)
+		always @(posedge CLK)
+			Q <= NEXT_Q;
+	else
+		always @(negedge CLK)
+			Q <= NEXT_Q;
+endmodule
diff --git a/parmys-plugin/techlibs/adffe2dff.v b/parmys-plugin/techlibs/adffe2dff.v
new file mode 100644
index 000000000..07f3ce474
--- /dev/null
+++ b/parmys-plugin/techlibs/adffe2dff.v
@@ -0,0 +1,55 @@
+// yosys -- Yosys Open SYnthesis Suite
+//
+// Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+// Copyright (C) 2022  CAS—Atlantic (University of New Brunswick, CASA)
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Modified version of adff2dff for $adffe cells
+//
+// SPDX-License-Identifier: ISC
+
+(* techmap_celltype = "$adffe" *)
+module adffe2dff (CLK, ARST, EN, D, Q); // techmap rule for $adffe: the reset is only applied on the selected CLK edge
+	parameter WIDTH = 1;
+	parameter CLK_POLARITY = 1;  // non-zero: rising edge, zero: falling edge
+	parameter ARST_POLARITY = 1; // level of ARST that selects ARST_VALUE
+	parameter EN_POLARITY = 1;   // level of EN that lets D through
+	parameter ARST_VALUE = 0;
+
+	input CLK, ARST, EN;
+	(* force_downto *)
+	input [WIDTH-1:0] D;
+	(* force_downto *)
+	output reg [WIDTH-1:0] Q;
+	(* force_downto *)
+	reg [WIDTH-1:0] NEXT_Q;
+
+	wire [1023:0] _TECHMAP_DO_ = "proc;;"; // techmap executes this command string ("proc") on the module
+	// Combinational next-state mux, so blocking assignments are used.
+	always @*
+		if (ARST == ARST_POLARITY)
+			NEXT_Q = ARST_VALUE;
+		else
+			NEXT_Q = D;
+	// In $adffe the reset has priority over the enable, so Q must also load while ARST is active and EN is not.
+	if (CLK_POLARITY) begin
+		always @(posedge CLK)
+			if (ARST == ARST_POLARITY || EN == EN_POLARITY)
+				Q <= NEXT_Q;
+	end else begin
+		always @(negedge CLK)
+			if (ARST == ARST_POLARITY || EN == EN_POLARITY)
+				Q <= NEXT_Q;
+	end
+endmodule
diff --git a/parmys-plugin/techlibs/aldff2dff.v b/parmys-plugin/techlibs/aldff2dff.v
new file mode 100644
index 000000000..b7ad1fc59
--- /dev/null
+++ b/parmys-plugin/techlibs/aldff2dff.v
@@ -0,0 +1,52 @@
+// yosys -- Yosys Open SYnthesis Suite
+//
+// Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+// Copyright (C) 2022  CAS—Atlantic (University of New Brunswick, CASA)
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Modified version of adff2dff for $aldff cells
+//
+// SPDX-License-Identifier: ISC
+
+(* techmap_celltype = "$aldff" *)
+module aldff2dff (CLK, ALOAD, AD, D, Q); // techmap rule for $aldff: the load value is only applied on the selected CLK edge
+	parameter WIDTH = 1;
+	parameter CLK_POLARITY = 1;   // non-zero: rising edge, zero: falling edge
+	parameter ALOAD_POLARITY = 1; // level of ALOAD that selects AD
+
+	input CLK, ALOAD;
+	(* force_downto *)
+	input [WIDTH-1:0] AD;
+	(* force_downto *)
+	input [WIDTH-1:0] D;
+	(* force_downto *)
+	output reg [WIDTH-1:0] Q;
+	(* force_downto *)
+	reg [WIDTH-1:0] NEXT_Q;
+
+	wire [1023:0] _TECHMAP_DO_ = "proc;;"; // techmap executes this command string ("proc") on the module
+	// Combinational next-state mux, so blocking assignments are used.
+	always @*
+		if (ALOAD == ALOAD_POLARITY)
+			NEXT_Q = AD;
+		else
+			NEXT_Q = D;
+	// Clocked register; the edge is chosen at elaboration time from CLK_POLARITY.
+	if (CLK_POLARITY)
+		always @(posedge CLK)
+			Q <= NEXT_Q;
+	else
+		always @(negedge CLK)
+			Q <= NEXT_Q;
+endmodule
diff --git a/parmys-plugin/techlibs/aldffe2dff.v b/parmys-plugin/techlibs/aldffe2dff.v
new file mode 100644
index 000000000..c0d0cf0de
--- /dev/null
+++ b/parmys-plugin/techlibs/aldffe2dff.v
@@ -0,0 +1,56 @@
+// yosys -- Yosys Open SYnthesis Suite
+//
+// Copyright (C) 2012  Claire Xenia Wolf <claire@yosyshq.com>
+// Copyright (C) 2022  CAS—Atlantic (University of New Brunswick, CASA)
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// Modified version of adff2dff for $aldffe cells
+//
+// SPDX-License-Identifier: ISC
+
+(* techmap_celltype = "$aldffe" *)
+module aldffe2dff (CLK, ALOAD, AD, D, EN, Q); // techmap rule for $aldffe: the load value is only applied on the selected CLK edge
+	parameter WIDTH = 1;
+	parameter CLK_POLARITY = 1;   // non-zero: rising edge, zero: falling edge
+	parameter ALOAD_POLARITY = 1; // level of ALOAD that selects AD
+	parameter EN_POLARITY = 1;    // level of EN that lets D through
+
+	input CLK, ALOAD, EN;
+	(* force_downto *)
+	input [WIDTH-1:0] AD;
+	(* force_downto *)
+	input [WIDTH-1:0] D;
+	(* force_downto *)
+	output reg [WIDTH-1:0] Q;
+	(* force_downto *)
+	reg [WIDTH-1:0] NEXT_Q;
+
+	wire [1023:0] _TECHMAP_DO_ = "proc;;"; // techmap executes this command string ("proc") on the module
+	// Combinational next-state mux, so blocking assignments are used.
+	always @*
+		if (ALOAD == ALOAD_POLARITY)
+			NEXT_Q = AD;
+		else
+			NEXT_Q = D;
+	// In $aldffe the load has priority over the enable, so Q must also load while ALOAD is active and EN is not.
+	if (CLK_POLARITY) begin
+		always @(posedge CLK)
+			if (ALOAD == ALOAD_POLARITY || EN == EN_POLARITY)
+				Q <= NEXT_Q;
+	end else begin
+		always @(negedge CLK)
+			if (ALOAD == ALOAD_POLARITY || EN == EN_POLARITY)
+				Q <= NEXT_Q;
+	end
+endmodule
diff --git a/parmys-plugin/techlibs/vtr_primitives.v b/parmys-plugin/techlibs/vtr_primitives.v
new file mode 120000
index 000000000..7c9507e8e
--- /dev/null
+++ b/parmys-plugin/techlibs/vtr_primitives.v
@@ -0,0 +1 @@
+../../third_party/vtr_flow/primitives.v
\ No newline at end of file
diff --git a/parmys-plugin/tests/Makefile b/parmys-plugin/tests/Makefile
new file mode 100644
index 000000000..2555d9fce
--- /dev/null
+++ b/parmys-plugin/tests/Makefile
@@ -0,0 +1,32 @@
+# Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Benchmark designs to run; each name has a matching directory here that holds <name>.tcl.
+TESTS = raygentop eltwise_layer VexRiscv_Lite mips32r1_core
+
+include $(shell pwd)/../../Makefile_test.common
+
+# Per-test hook consumed by Makefile_test.common (presumably the verification command; `true` always passes - TODO confirm).
+raygentop_verify = true
+eltwise_layer_verify = true
+VexRiscv_Lite_verify = true
+mips32r1_core_verify = true
+
+# Delete files matching *.net.dot, *.yosys.blif and *.info below this directory; runs as part of `clean`.
+.PHONY: clean_modules
+clean_modules:
+	@find . \( -name "*.net.dot" -o -name "*.yosys.blif" -o -name "*.info" \) -print0 | xargs -0 rm -rf
+clean: clean_modules
diff --git a/parmys-plugin/tests/VexRiscv_Lite/VexRiscv_Lite.tcl b/parmys-plugin/tests/VexRiscv_Lite/VexRiscv_Lite.tcl
new file mode 100644
index 000000000..4b026f886
--- /dev/null
+++ b/parmys-plugin/tests/VexRiscv_Lite/VexRiscv_Lite.tcl
@@ -0,0 +1,95 @@
+yosys -import
+# Load the parmys plugin; the second import presumably exposes the commands it adds (parmys_arch, parmys) to Tcl.
+plugin -i parmys
+
+yosys -import
+# Read the primitive library (`+/` is the Yosys share directory); -nomem2reg stops memories being turned into registers.
+read_verilog -nomem2reg +/parmys/vtr_primitives.v
+# Mark the RAM modules keep_hierarchy so the later `flatten` leaves their instances alone.
+setattr -mod -set keep_hierarchy 1 single_port_ram
+
+setattr -mod -set keep_hierarchy 1 dual_port_ram
+
+
+puts "Using parmys as partial mapper"
+
+# Tell parmys which architecture file to target (symlinked into this directory).
+parmys_arch -a k6_frac_N10_frac_chain_mem32K_40nm.xml
+
+# The design file is <DESIGN_TOP>.v, with DESIGN_TOP taken from the environment.
+read_verilog -sv -nolatches $::env(DESIGN_TOP).v
+
+
+# Check that there are no combinational loops
+
+scc -select
+
+select -assert-none %
+
+select -clear
+
+# Elaborate the hierarchy and let Yosys pick the top module.
+hierarchy -check -auto-top -purge_lib
+
+# Generic clean-up and coarse-grain optimisation passes.
+opt_expr
+
+opt_clean
+
+check
+
+opt -nodffe -nosdff
+# `procs` is the Tcl name of the Yosys `proc` pass (renamed on import because `proc` is a Tcl builtin).
+procs -norom
+
+fsm
+
+opt
+
+wreduce
+
+peepopt
+
+opt_clean
+
+share
+
+opt -full
+# Gather memories into memory cells without mapping them to logic, then flatten the design.
+memory -nomap
+
+flatten
+
+opt -full
+# Rewrite $adff / $adffe / $aldff / $aldffe cells with the plugin's techmap rules.
+techmap -map +/parmys/adff2dff.v
+
+techmap -map +/parmys/adffe2dff.v
+
+techmap -map +/parmys/aldff2dff.v
+
+techmap -map +/parmys/aldffe2dff.v
+
+opt -full
+# Partial mapping by parmys, given the architecture file and the Odin configuration from this directory.
+parmys -a k6_frac_N10_frac_chain_mem32K_40nm.xml -nopass -c odin_config.xml
+
+opt -full
+# Lower what is left with the default techmap rules; dffunmap then turns FF enables / sync resets into plain logic.
+techmap 
+
+opt -fast
+
+dffunmap
+
+opt -fast -noff
+
+# Print statistics, re-check the hierarchy and abort if `check` finds a problem.
+tee -o /dev/stdout stat
+
+hierarchy -check -auto-top -purge_lib
+
+check -assert
+# Write the result as BLIF; constant 1 / 0 / undef are emitted as nets named vcc, gnd and unconn.
+write_blif -true + vcc -false + gnd -undef + unconn -blackbox $::env(DESIGN_TOP).yosys.blif
+
diff --git a/parmys-plugin/tests/VexRiscv_Lite/VexRiscv_Lite.v b/parmys-plugin/tests/VexRiscv_Lite/VexRiscv_Lite.v
new file mode 120000
index 000000000..ede3e75e3
--- /dev/null
+++ b/parmys-plugin/tests/VexRiscv_Lite/VexRiscv_Lite.v
@@ -0,0 +1 @@
+../../../third_party/VexRiscv_Lite/VexRiscv_Lite.v
\ No newline at end of file
diff --git a/parmys-plugin/tests/VexRiscv_Lite/k6_frac_N10_frac_chain_mem32K_40nm.xml b/parmys-plugin/tests/VexRiscv_Lite/k6_frac_N10_frac_chain_mem32K_40nm.xml
new file mode 120000
index 000000000..f8c1cb9d4
--- /dev/null
+++ b/parmys-plugin/tests/VexRiscv_Lite/k6_frac_N10_frac_chain_mem32K_40nm.xml
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml
\ No newline at end of file
diff --git a/parmys-plugin/tests/VexRiscv_Lite/odin_config.xml b/parmys-plugin/tests/VexRiscv_Lite/odin_config.xml
new file mode 100644
index 000000000..472ee2cac
--- /dev/null
+++ b/parmys-plugin/tests/VexRiscv_Lite/odin_config.xml
@@ -0,0 +1,40 @@
+<!--
+# Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+-->
+<config>
+	<inputs>
+		<input_type>Verilog</input_type>
+		<input_path_and_name>raygentop.v</input_path_and_name> <!-- NOTE(review): names raygentop although this config sits in tests/VexRiscv_Lite; looks copy-pasted, confirm whether parmys reads it -->
+	</inputs>
+	<output>
+		<output_type>blif</output_type>
+		<output_path_and_name>raygentop.yosys.blif</output_path_and_name> <!-- NOTE(review): same remark; the Tcl script writes DESIGN_TOP.yosys.blif itself -->
+		<target>
+			<arch_file>k6_frac_N10_frac_chain_mem32K_40nm.xml</arch_file> <!-- same file the Tcl script passes to parmys with -a -->
+		</target>
+	</output>
+	<optimizations>
+		<multiply size="3" fixed="1" fracture="0" padding="-1" />
+		<memory split_memory_width="1" split_memory_depth="15" />
+		<adder size="0" threshold_size="1" />
+	</optimizations>
+	<debug_outputs>
+		<debug_output_path>.</debug_output_path>
+		<output_ast_graphs>0</output_ast_graphs>
+		<output_netlist_graphs>0</output_netlist_graphs>
+	</debug_outputs>
+</config>
\ No newline at end of file
diff --git a/parmys-plugin/tests/eltwise_layer/eltwise_layer.tcl b/parmys-plugin/tests/eltwise_layer/eltwise_layer.tcl
new file mode 100644
index 000000000..832f287c6
--- /dev/null
+++ b/parmys-plugin/tests/eltwise_layer/eltwise_layer.tcl
@@ -0,0 +1,87 @@
+yosys -import
+# Load the parmys plugin; the second import presumably exposes the commands it adds (parmys_arch, parmys) to Tcl.
+plugin -i parmys
+
+yosys -import
+# Read the primitive library (`+/` is the Yosys share directory); -nomem2reg stops memories being turned into registers.
+read_verilog -nomem2reg +/parmys/vtr_primitives.v
+# Mark the RAM modules keep_hierarchy so the later `flatten` leaves their instances alone.
+setattr -mod -set keep_hierarchy 1 single_port_ram
+
+setattr -mod -set keep_hierarchy 1 dual_port_ram
+
+puts "Using parmys as partial mapper"
+# Tell parmys which architecture file to target (symlinked into this directory).
+parmys_arch -a k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
+# hard_block_include.v is read together with <DESIGN_TOP>.v; DESIGN_TOP is taken from the environment.
+read_verilog -sv -nolatches hard_block_include.v $::env(DESIGN_TOP).v
+
+
+# Check that there are no combinational loops
+
+scc -select
+
+select -assert-none %
+
+select -clear
+# Elaborate the hierarchy and let Yosys pick the top module.
+hierarchy -check -auto-top -purge_lib
+# Generic clean-up and coarse-grain optimisation passes.
+opt_expr
+
+opt_clean
+
+check
+
+opt -nodffe -nosdff
+# `procs` is the Tcl name of the Yosys `proc` pass (renamed on import because `proc` is a Tcl builtin).
+procs -norom
+
+fsm
+
+opt
+
+wreduce
+
+peepopt
+
+opt_clean
+
+share
+
+opt -full
+# Gather memories into memory cells without mapping them to logic, then flatten the design.
+memory -nomap
+
+flatten
+
+opt -full
+# Rewrite $adff / $adffe / $aldff / $aldffe cells with the plugin's techmap rules.
+techmap -map +/parmys/adff2dff.v
+
+techmap -map +/parmys/adffe2dff.v
+
+techmap -map +/parmys/aldff2dff.v
+
+techmap -map +/parmys/aldffe2dff.v
+
+opt -full
+# Partial mapping by parmys, given the architecture file and the Odin configuration from this directory.
+parmys -a k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml -nopass -c odin_config.xml
+
+opt -full
+# Lower what is left with the default techmap rules; dffunmap then turns FF enables / sync resets into plain logic.
+techmap 
+
+opt -fast
+
+dffunmap
+
+opt -fast -noff
+# Print statistics and re-check the hierarchy.
+tee -o /dev/stdout stat
+
+hierarchy -check -auto-top -purge_lib
+# Write the result as BLIF; constant 1 / 0 / undef are emitted as nets named vcc, gnd and unconn.
+write_blif -true + vcc -false + gnd -undef + unconn -blackbox $::env(DESIGN_TOP).yosys.blif
+
diff --git a/parmys-plugin/tests/eltwise_layer/eltwise_layer.v b/parmys-plugin/tests/eltwise_layer/eltwise_layer.v
new file mode 120000
index 000000000..0223b89b0
--- /dev/null
+++ b/parmys-plugin/tests/eltwise_layer/eltwise_layer.v
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/verilog/eltwise_layer.v
\ No newline at end of file
diff --git a/parmys-plugin/tests/eltwise_layer/hard_block_include.v b/parmys-plugin/tests/eltwise_layer/hard_block_include.v
new file mode 120000
index 000000000..f3f32f627
--- /dev/null
+++ b/parmys-plugin/tests/eltwise_layer/hard_block_include.v
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/verilog/hard_block_include.v
\ No newline at end of file
diff --git a/parmys-plugin/tests/eltwise_layer/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml b/parmys-plugin/tests/eltwise_layer/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
new file mode 120000
index 000000000..34b35f896
--- /dev/null
+++ b/parmys-plugin/tests/eltwise_layer/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/arch/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
\ No newline at end of file
diff --git a/parmys-plugin/tests/eltwise_layer/odin_config.xml b/parmys-plugin/tests/eltwise_layer/odin_config.xml
new file mode 100644
index 000000000..a9612d67d
--- /dev/null
+++ b/parmys-plugin/tests/eltwise_layer/odin_config.xml
@@ -0,0 +1,41 @@
+<!--
+# Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+-->
+<config>
+	<inputs>
+		<input_type>Verilog</input_type>
+		<input_path_and_name>hard_block_include.v</input_path_and_name> <!-- same two files the Tcl script reads for this test -->
+		<input_path_and_name>eltwise_layer.v</input_path_and_name>
+	</inputs>
+	<output>
+		<output_type>blif</output_type>
+		<output_path_and_name>eltwise_layer.yosys.blif</output_path_and_name>
+		<target>
+			<arch_file>k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml</arch_file> <!-- same file the Tcl script passes to parmys with -a -->
+		</target>
+	</output>
+	<optimizations>
+		<multiply size="3" fixed="1" fracture="0" padding="-1" />
+		<memory split_memory_width="1" split_memory_depth="11" />
+		<adder size="0" threshold_size="1" />
+	</optimizations>
+	<debug_outputs>
+		<debug_output_path>.</debug_output_path>
+		<output_ast_graphs>0</output_ast_graphs>
+		<output_netlist_graphs>0</output_netlist_graphs>
+	</debug_outputs>
+</config>
\ No newline at end of file
diff --git a/parmys-plugin/tests/mips32r1_core/k6_frac_N10_frac_chain_mem32K_40nm.xml b/parmys-plugin/tests/mips32r1_core/k6_frac_N10_frac_chain_mem32K_40nm.xml
new file mode 120000
index 000000000..f8c1cb9d4
--- /dev/null
+++ b/parmys-plugin/tests/mips32r1_core/k6_frac_N10_frac_chain_mem32K_40nm.xml
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml
\ No newline at end of file
diff --git a/parmys-plugin/tests/mips32r1_core/mips32r1_core.tcl b/parmys-plugin/tests/mips32r1_core/mips32r1_core.tcl
new file mode 100644
index 000000000..4b026f886
--- /dev/null
+++ b/parmys-plugin/tests/mips32r1_core/mips32r1_core.tcl
@@ -0,0 +1,95 @@
+yosys -import
+# Load the parmys plugin; the second import presumably exposes the commands it adds (parmys_arch, parmys) to Tcl.
+plugin -i parmys
+
+yosys -import
+# Read the primitive library (`+/` is the Yosys share directory); -nomem2reg stops memories being turned into registers.
+read_verilog -nomem2reg +/parmys/vtr_primitives.v
+# Mark the RAM modules keep_hierarchy so the later `flatten` leaves their instances alone.
+setattr -mod -set keep_hierarchy 1 single_port_ram
+
+setattr -mod -set keep_hierarchy 1 dual_port_ram
+
+
+puts "Using parmys as partial mapper"
+
+# Tell parmys which architecture file to target (symlinked into this directory).
+parmys_arch -a k6_frac_N10_frac_chain_mem32K_40nm.xml
+
+# The design file is <DESIGN_TOP>.v, with DESIGN_TOP taken from the environment.
+read_verilog -sv -nolatches $::env(DESIGN_TOP).v
+
+
+# Check that there are no combinational loops
+
+scc -select
+
+select -assert-none %
+
+select -clear
+
+# Elaborate the hierarchy and let Yosys pick the top module.
+hierarchy -check -auto-top -purge_lib
+
+# Generic clean-up and coarse-grain optimisation passes.
+opt_expr
+
+opt_clean
+
+check
+
+opt -nodffe -nosdff
+# `procs` is the Tcl name of the Yosys `proc` pass (renamed on import because `proc` is a Tcl builtin).
+procs -norom
+
+fsm
+
+opt
+
+wreduce
+
+peepopt
+
+opt_clean
+
+share
+
+opt -full
+# Gather memories into memory cells without mapping them to logic, then flatten the design.
+memory -nomap
+
+flatten
+
+opt -full
+# Rewrite $adff / $adffe / $aldff / $aldffe cells with the plugin's techmap rules.
+techmap -map +/parmys/adff2dff.v
+
+techmap -map +/parmys/adffe2dff.v
+
+techmap -map +/parmys/aldff2dff.v
+
+techmap -map +/parmys/aldffe2dff.v
+
+opt -full
+# Partial mapping by parmys, given the architecture file and the Odin configuration from this directory.
+parmys -a k6_frac_N10_frac_chain_mem32K_40nm.xml -nopass -c odin_config.xml
+
+opt -full
+# Lower what is left with the default techmap rules; dffunmap then turns FF enables / sync resets into plain logic.
+techmap 
+
+opt -fast
+
+dffunmap
+
+opt -fast -noff
+
+# Print statistics, re-check the hierarchy and abort if `check` finds a problem.
+tee -o /dev/stdout stat
+
+hierarchy -check -auto-top -purge_lib
+
+check -assert
+# Write the result as BLIF; constant 1 / 0 / undef are emitted as nets named vcc, gnd and unconn.
+write_blif -true + vcc -false + gnd -undef + unconn -blackbox $::env(DESIGN_TOP).yosys.blif
+
diff --git a/parmys-plugin/tests/mips32r1_core/mips32r1_core.v b/parmys-plugin/tests/mips32r1_core/mips32r1_core.v
new file mode 100644
index 000000000..2f79fbb9e
--- /dev/null
+++ b/parmys-plugin/tests/mips32r1_core/mips32r1_core.v
@@ -0,0 +1,35 @@
+// Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+`include "../../../third_party/mips32r1_core/mips32r1/Add.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Compare.v"
+`include "../../../third_party/mips32r1_core/mips32r1/CPZero.v"
+`include "../../../third_party/mips32r1_core/mips32r1/EXMEM_Stage.v"
+`include "../../../third_party/mips32r1_core/mips32r1/IDEX_Stage.v"
+`include "../../../third_party/mips32r1_core/mips32r1/MemControl.v"
+`include "../../../third_party/mips32r1_core/mips32r1/MIPS_Parameters.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Mux4.v"
+`include "../../../third_party/mips32r1_core/mips32r1/RegisterFile.v"
+`include "../../../third_party/mips32r1_core/mips32r1/TrapDetect.v"
+`include "../../../third_party/mips32r1_core/mips32r1/ALU.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Control.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Divide.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Hazard_Detection.v"
+`include "../../../third_party/mips32r1_core/mips32r1/IFID_Stage.v"
+`include "../../../third_party/mips32r1_core/mips32r1/MEMWB_Stage.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Mux2.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Processor.v"
+`include "../../../third_party/mips32r1_core/mips32r1/Register.v"
\ No newline at end of file
diff --git a/parmys-plugin/tests/mips32r1_core/odin_config.xml b/parmys-plugin/tests/mips32r1_core/odin_config.xml
new file mode 100644
index 000000000..472ee2cac
--- /dev/null
+++ b/parmys-plugin/tests/mips32r1_core/odin_config.xml
@@ -0,0 +1,40 @@
+<!--
+# Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+-->
+<config>
+	<inputs>
+		<input_type>Verilog</input_type>
+		<input_path_and_name>raygentop.v</input_path_and_name> <!-- NOTE(review): names raygentop although this config sits in tests/mips32r1_core; looks copy-pasted, confirm whether parmys reads it -->
+	</inputs>
+	<output>
+		<output_type>blif</output_type>
+		<output_path_and_name>raygentop.yosys.blif</output_path_and_name> <!-- NOTE(review): same remark; the Tcl script writes DESIGN_TOP.yosys.blif itself -->
+		<target>
+			<arch_file>k6_frac_N10_frac_chain_mem32K_40nm.xml</arch_file> <!-- same file the Tcl script passes to parmys with -a -->
+		</target>
+	</output>
+	<optimizations>
+		<multiply size="3" fixed="1" fracture="0" padding="-1" />
+		<memory split_memory_width="1" split_memory_depth="15" />
+		<adder size="0" threshold_size="1" />
+	</optimizations>
+	<debug_outputs>
+		<debug_output_path>.</debug_output_path>
+		<output_ast_graphs>0</output_ast_graphs>
+		<output_netlist_graphs>0</output_netlist_graphs>
+	</debug_outputs>
+</config>
\ No newline at end of file
diff --git a/parmys-plugin/tests/raygentop/k6_frac_N10_frac_chain_mem32K_40nm.xml b/parmys-plugin/tests/raygentop/k6_frac_N10_frac_chain_mem32K_40nm.xml
new file mode 120000
index 000000000..f8c1cb9d4
--- /dev/null
+++ b/parmys-plugin/tests/raygentop/k6_frac_N10_frac_chain_mem32K_40nm.xml
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml
\ No newline at end of file
diff --git a/parmys-plugin/tests/raygentop/odin_config.xml b/parmys-plugin/tests/raygentop/odin_config.xml
new file mode 100644
index 000000000..472ee2cac
--- /dev/null
+++ b/parmys-plugin/tests/raygentop/odin_config.xml
@@ -0,0 +1,40 @@
+<!--
+# Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+-->
+<config>
+	<inputs>
+		<input_type>Verilog</input_type>
+		<input_path_and_name>raygentop.v</input_path_and_name> <!-- symlinked into this directory from third_party/vtr_flow/verilog -->
+	</inputs>
+	<output>
+		<output_type>blif</output_type>
+		<output_path_and_name>raygentop.yosys.blif</output_path_and_name>
+		<target>
+			<arch_file>k6_frac_N10_frac_chain_mem32K_40nm.xml</arch_file> <!-- same file the Tcl script passes to parmys with -a -->
+		</target>
+	</output>
+	<optimizations>
+		<multiply size="3" fixed="1" fracture="0" padding="-1" />
+		<memory split_memory_width="1" split_memory_depth="15" />
+		<adder size="0" threshold_size="1" />
+	</optimizations>
+	<debug_outputs>
+		<debug_output_path>.</debug_output_path>
+		<output_ast_graphs>0</output_ast_graphs>
+		<output_netlist_graphs>0</output_netlist_graphs>
+	</debug_outputs>
+</config>
\ No newline at end of file
diff --git a/parmys-plugin/tests/raygentop/raygentop.tcl b/parmys-plugin/tests/raygentop/raygentop.tcl
new file mode 100644
index 000000000..b5463bb96
--- /dev/null
+++ b/parmys-plugin/tests/raygentop/raygentop.tcl
@@ -0,0 +1,93 @@
+yosys -import
+# Load the parmys plugin; the second import presumably exposes the commands it adds (parmys_arch, parmys) to Tcl.
+plugin -i parmys
+
+yosys -import
+# Read the primitive library (`+/` is the Yosys share directory); -nomem2reg stops memories being turned into registers.
+read_verilog -nomem2reg +/parmys/vtr_primitives.v
+# Mark the RAM modules keep_hierarchy so the later `flatten` leaves their instances alone.
+setattr -mod -set keep_hierarchy 1 single_port_ram
+
+setattr -mod -set keep_hierarchy 1 dual_port_ram
+
+
+puts "Using parmys as partial mapper"
+
+# Tell parmys which architecture file to target (symlinked into this directory).
+parmys_arch -a k6_frac_N10_frac_chain_mem32K_40nm.xml
+
+# The design file is <DESIGN_TOP>.v, with DESIGN_TOP taken from the environment.
+read_verilog -sv -nolatches $::env(DESIGN_TOP).v
+
+
+# Check that there are no combinational loops
+
+scc -select
+
+select -assert-none %
+
+select -clear
+
+# Elaborate the hierarchy and let Yosys pick the top module.
+hierarchy -check -auto-top -purge_lib
+
+# Generic clean-up and coarse-grain optimisation passes.
+opt_expr
+
+opt_clean
+
+check
+
+opt -nodffe -nosdff
+# `procs` is the Tcl name of the Yosys `proc` pass (renamed on import because `proc` is a Tcl builtin).
+procs -norom
+
+fsm
+
+opt
+
+wreduce
+
+peepopt
+
+opt_clean
+
+share
+
+opt -full
+# Gather memories into memory cells without mapping them to logic, then flatten the design.
+memory -nomap
+
+flatten
+
+opt -full
+# Rewrite $adff / $adffe / $aldff / $aldffe cells with the plugin's techmap rules.
+techmap -map +/parmys/adff2dff.v
+
+techmap -map +/parmys/adffe2dff.v
+
+techmap -map +/parmys/aldff2dff.v
+
+techmap -map +/parmys/aldffe2dff.v
+
+opt -full
+# Partial mapping by parmys; -viz presumably requests the netlist visualisation output - TODO confirm.
+parmys -a k6_frac_N10_frac_chain_mem32K_40nm.xml -nopass -c odin_config.xml -viz
+
+opt -full
+# Lower what is left with the default techmap rules; dffunmap then turns FF enables / sync resets into plain logic.
+techmap 
+
+opt -fast
+
+dffunmap
+
+opt -fast -noff
+
+# Print statistics and re-check the hierarchy.
+tee -o /dev/stdout stat
+
+hierarchy -check -auto-top -purge_lib
+# Write the result as BLIF; constant 1 / 0 / undef are emitted as nets named vcc, gnd and unconn.
+write_blif -true + vcc -false + gnd -undef + unconn -blackbox $::env(DESIGN_TOP).yosys.blif
+
diff --git a/parmys-plugin/tests/raygentop/raygentop.v b/parmys-plugin/tests/raygentop/raygentop.v
new file mode 120000
index 000000000..750464717
--- /dev/null
+++ b/parmys-plugin/tests/raygentop/raygentop.v
@@ -0,0 +1 @@
+../../../third_party/vtr_flow/verilog/raygentop.v
\ No newline at end of file
diff --git a/parmys-plugin/utils/ast_util.cc b/parmys-plugin/utils/ast_util.cc
new file mode 100644
index 000000000..6eab5f864
--- /dev/null
+++ b/parmys-plugin/utils/ast_util.cc
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_globals.h"
+#include "odin_types.h"
+#include <algorithm>
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ast_util.h"
+#include "odin_util.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+
+/*---------------------------------------------------------------------------
+ * (function: create_node_w_type)
+ *
+ * Allocates an AST node with vtr::calloc, tags it with `id` and `loc`, and
+ * gives it the next value of a function-local serial counter.
+ * The node starts with no children, chunk_size 1 and every variable flag false.
+ *
+ * id  - node kind; must not be NO_ID (checked with oassert)
+ * loc - source location copied into the node
+ *
+ * Returns the new node.
+ *-------------------------------------------------------------------------*/
+ast_node_t *create_node_w_type(ids id, loc_t loc)
+{
+    oassert(id != NO_ID);
+    static long next_serial = 0; // shared by every call, so serials follow creation order
+
+    ast_node_t *node = (ast_node_t *)vtr::calloc(1, sizeof(ast_node_t));
+    oassert(node != nullptr);
+
+    /* generic node fields */
+    node->type = id;
+    node->children = nullptr;
+    node->num_children = 0;
+    node->unique_count = next_serial++;
+    node->loc = loc;
+    node->far_tag = 0;
+    node->high_number = 0;
+    node->hb_port = 0;
+    node->net_node = 0;
+    node->types.vnumber = nullptr;
+    node->types.identifier = nullptr;
+    node->chunk_size = 1;
+    node->identifier_node = nullptr;
+
+    /* variable description: no initial value, all flags cleared, unsigned */
+    auto &var = node->types.variable;
+    var.initial_value = nullptr;
+    var.is_parameter = var.is_string = var.is_localparam = var.is_defparam = false;
+    var.is_port = var.is_input = var.is_output = var.is_inout = false;
+    var.is_wire = var.is_reg = var.is_genvar = var.is_memory = false;
+    var.signedness = UNSIGNED;
+
+    /* concatenation info starts out empty */
+    node->types.concat.num_bit_strings = 0;
+    node->types.concat.bit_strings = nullptr;
+
+    return node;
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: create_tree_node_id)
+ * Builds an IDENTIFIERS node at `loc` whose identifier is `string` (the pointer is stored, not copied).
+ *-------------------------------------------------------------------------------------------*/
+ast_node_t *create_tree_node_id(char *string, loc_t loc)
+{
+    ast_node_t *const id_node = create_node_w_type(IDENTIFIERS, loc);
+    id_node->types.identifier = string;
+    return id_node;
+}
diff --git a/parmys-plugin/utils/ast_util.h b/parmys-plugin/utils/ast_util.h
new file mode 100644
index 000000000..1f3ab2e8e
--- /dev/null
+++ b/parmys-plugin/utils/ast_util.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _AST_UTIL_H_
+#define _AST_UTIL_H_
+
+#include "odin_types.h"
+
+ast_node_t *create_node_w_type(ids id, loc_t loc);        // allocates a fresh node of type `id` located at `loc`
+ast_node_t *create_tree_node_id(char *string, loc_t loc); // IDENTIFIERS node; `string` is stored as-is, not copied
+
+#endif // _AST_UTIL_H_
diff --git a/parmys-plugin/utils/config_t.h b/parmys-plugin/utils/config_t.h
new file mode 100644
index 000000000..9b3ac396b
--- /dev/null
+++ b/parmys-plugin/utils/config_t.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _CONFIG_T_H_
+#define _CONFIG_T_H_
+
+#include "odin_types.h"
+#include <string>
+#include <vector>
+
+/* Configuration values shared across the plugin through the global `configuration` object */
+struct config_t {
+    std::vector<std::string> list_of_file_names; // presumably the input design files - confirm against the code that fills it
+
+    std::string debug_output_path; // path for where to output the debug outputs
+    std::string dsp_verilog;       // path for the output Verilog file including target DSPs' declaration
+    bool coarsen;                  // Specify if the input BLIF is coarse-grain
+
+    bool output_netlist_graphs; // switch that outputs netlist graphs per node for use with GraphViz tools
+
+    int min_hard_multiplier; // threshold from hard to soft logic
+    int mult_padding;        // setting how multipliers are padded to fit fixed size
+    // Flag for fixed or variable hard multiplier (1 or 0)
+    int fixed_hard_multiplier;
+    // Flag for splitting hard multipliers. If fixed_hard_multiplier is set, this must be 1.
+    int split_hard_multiplier;
+    // 1 to split memory width down to a size of 1. 0 to split to arch width. (Stored as a char, unlike the other flags.)
+    char split_memory_width;
+    // Set to a positive integer to split memory depth to that address width. 0 to split to arch width.
+    int split_memory_depth;
+
+    // NOTE(review): presumably the hard-adder counterpart of fixed_hard_multiplier (1 or 0) - the old text said "hard mult"
+    int fixed_hard_adder;
+    // Threshold from hard to soft logic
+    int min_threshold_adder;
+    // defines if the first cin of an adder/subtractor is connected to a global gnd/vdd
+    // or generated using a dummy adder with both inputs set to gnd/vdd
+    bool adder_cin_global;
+
+    // If the memory is smaller than both of these, it will be converted to soft logic.
+    int soft_logic_memory_depth_threshold;
+    int soft_logic_memory_width_threshold;
+
+    std::string arch_file; // Name of the FPGA architecture file
+    std::string tcl_file;  // TCL file to be run by yosys
+};
+
+extern config_t configuration;
+
+#endif // _CONFIG_T_H_
diff --git a/parmys-plugin/utils/enum_str.cc b/parmys-plugin/utils/enum_str.cc
new file mode 100644
index 000000000..5460ddf7b
--- /dev/null
+++ b/parmys-plugin/utils/enum_str.cc
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_types.h"
+
+const char *edge_type_e_STR[] = {
+  // NOTE(review): presumably indexed by edge_type_e (declared elsewhere) - confirm the order matches the enum
+  "UNDEFINED_SENSITIVITY",   "FALLING_EDGE_SENSITIVITY", "RISING_EDGE_SENSITIVITY",
+  "ACTIVE_HIGH_SENSITIVITY", "ACTIVE_LOW_SENSITIVITY",   "ASYNCHRONOUS_SENSITIVITY",
+};
+// Each table below holds two spellings, {long, short}, of one constant-net name.
+const char *_ZERO_GND_ZERO[] = {"ZERO_GND_ZERO", "ZGZ"};
+
+const char *_ONE_VCC_CNS[] = {
+  "ONE_VCC_CNS",
+  "OVC",
+};
+
+const char *_ZERO_PAD_ZERO[] = {"ZERO_PAD_ZERO", "ZPZ"};
+// The spelling actually used is picked by ODIN_STRING_TYPE, which is defined outside this file.
+const char *ZERO_GND_ZERO = _ZERO_GND_ZERO[ODIN_STRING_TYPE];
+const char *ONE_VCC_CNS = _ONE_VCC_CNS[ODIN_STRING_TYPE];
+const char *ZERO_PAD_ZERO = _ZERO_PAD_ZERO[ODIN_STRING_TYPE];
+// Memory primitive names; the first two also appear as keys in yosys_subckt_strmap further down this file.
+const char *SINGLE_PORT_RAM_string = "single_port_ram";
+const char *DUAL_PORT_RAM_string = "dual_port_ram";
+const char *LUTRAM_string = "lutram_ram";
+
+const char *operation_list_STR[][2] = {
+  {"NO_OP", "nOP"}, // each row is {long name, short name}; presumably in enum operation_list order (declared elsewhere) - confirm
+  {"FF_NODE", "FF"},
+  {"BUF_NODE", "BUF"},
+  {"MULTI_PORT_MUX", "nMUX"}, // port 1 = control, port 2+ = mux options
+  {"INPUT_NODE", "IN"},
+  {"OUTPUT_NODE", "OUT"},
+  {"GND_NODE", "GND"},
+  {"VCC_NODE", "VCC"},
+  {"CLOCK_NODE", "CLK"},
+  {"ADD", "ADD"},             // +
+  {"MINUS", "MIN"},           // -
+  {"BITWISE_NOT", "bNOT"},    // ~
+  {"BITWISE_AND", "bAND"},    // &
+  {"BITWISE_OR", "bOR"},      // |
+  {"BITWISE_NAND", "bNAND"},  // ~&
+  {"BITWISE_NOR", "bNOR"},    // ~|
+  {"BITWISE_XNOR", "bXNOR"},  // ~^
+  {"BITWISE_XOR", "bXOR"},    // ^
+  {"LOGICAL_NOT", "lNOT"},    // !
+  {"LOGICAL_OR", "lOR"},      // ||
+  {"LOGICAL_AND", "lAND"},    // &&
+  {"LOGICAL_NAND", "lNAND"},  // No Symbol
+  {"LOGICAL_NOR", "lNOR"},    // No Symbol
+  {"LOGICAL_XNOR", "lXNOR"},  // No symbol
+  {"LOGICAL_XOR", "lXOR"},    // No Symbol
+  {"MULTIPLY", "MUL"},        // *
+  {"DIVIDE", "DIV"},          // /
+  {"MODULO", "MOD"},          // %
+  {"POWER", "POW"},           // **
+  {"LT", "LT"},               // <
+  {"GT", "GT"},               // >
+  {"LOGICAL_EQUAL", "lEQ"},   // ==
+  {"NOT_EQUAL", "lNEQ"},      // !=
+  {"LTE", "LTE"},             // <=
+  {"GTE", "GTE"},             // >=
+  {"SR", "SR"},               // >>
+  {"ASR", "ASR"},             // >>>
+  {"SL", "SL"},               // <<
+  {"ASL", "ASL"},             // <<<
+  {"CASE_EQUAL", "cEQ"},      // ===
+  {"CASE_NOT_EQUAL", "cNEQ"}, // !==
+  {"ADDER_FUNC", "ADDER"},
+  {"CARRY_FUNC", "CARRY"},
+  {"MUX_2", "MUX_2"},
+  {"BLIF_FUNCTION", "BLIFf"},
+  {"NETLIST_FUNCTION", "NETf"},
+  {"SMUX_2", "SMUX_2"}, // MUX_2 with single bit selector (no need to add not selector as the second pin)
+  {"MEMORY", "MEM"},
+  {"PAD_NODE", "PAD"},
+  {"HARD_IP", "HARD"},
+  {"GENERIC", "GEN"},   /* added for the unknown node type */
+  {"CLOG2", "CL2"},     // $clog2
+  {"UNSIGNED", "UNSG"}, // $unsigned
+  {"SIGNED", "SG"},     // $signed
+  // [START] operations to cover yosys subckt
+  {"MULTI_BIT_MUX_2", "nbMUX"},      // like MUX_2 but with n-bit input/output
+  {"MULTIPORT_nBIT_SMUX", "npbMUX"}, // n-bit input/output in multi port mux
+  {"PMUX", "pMUX"},                  // Multiplexer with many inputs using one-hot select signal
+  {"SDFF", "sDFF"},                  // data, S to reset value and output port
+  {"DFFE", "DFFe"},                  // data, enable to output port
+  {"SDFFE", "sDFFe"},                // data, synchronous reset value and enable to output port
+  {"SDFFCE", "sDFFce"},              // data, synchronous reset value and enable to reset value and output port
+  {"DFFSR", "DFFsr"},                // data, clear and set to output port
+  {"DFFSRE", "DFFsre"},              // data, clear and set with enable to output port
+  {"DLATCH", "Dlatch"},              // data to output port based on polarity without clk
+  {"ADLATCH", "aDlatch"},            // data to output port based on polarity without clk
+  {"SETCLR", "setclr"},              // set or clear an input pin
+  {"SPRAM", "spRAM"},                // representing primitive single port ram
+  {"DPRAM", "dpRAM"},                // representing primitive dual port ram
+  {"YMEM", "yRAM"},                  // the type yosys_subckt_strmap assigns to the Yosys "$mem" cell
+  {"YMEM2", "yRAM"},                 // the type assigned to "$mem_v2"; shares the short name "yRAM" with YMEM
+  {"ROM", "ROM"},
+  {"BRAM", "bRAM"},    // block of memory generated in a Yosys subcircuit-format BLIF file
+                       // [END] operations to cover yosys subckt
+  {"ERROR OOB", "OOB"} // should not reach this
+};
+
+// EDDIE: new enum value for ids to replace MEMORY from operation_t
+/* maps the name of each supported input/output file type to its file_type_e value */
+strmap<file_type_e> file_type_strmap({{"ilang", file_type_e::_ILANG},
+                                      {"verilog", file_type_e::_VERILOG},
+                                      {"verilog_header", file_type_e::_VERILOG_HEADER},
+                                      {"blif", file_type_e::_BLIF},
+                                      {"eblif", file_type_e::_EBLIF},
+                                      {"undef", file_type_e::_UNDEFINED}});
+
+/**
+ * global hashmap of yosys subckt types
+ * Technically, Yosys only outputs the following hard blocks
+ * as (.subckt) record in its output BLIF file
+ *
+ *  FIRST_ELEMENT: Yosys model names showing in a blif file
+ *  SECOND_ELEMENT: corresponding Odin-II cell type
+ *
+ * NOTE: to add support for a new type, first you would find a
+ * corresponding Odin-II cell type or create a new one matching
+ * with the new type corresponding model (.subckt) in BLIF file,
+ * and add it to the following typemap. Then, you would need to
+ * specify the model input-output order in the Odin-II BLIF Reader.
+ * At the end, a resolve_XXX_node function needs to be implemented
+ * in the BLIF Elaboration phase to make the new node compatible
+ * with the Odin-II partial mapper.
+ */
+strmap<operation_list> yosys_subckt_strmap({
+  {"$_ANDNOT_", SKIP},       // (A, B, Y)
+  {"$_AND_", SKIP},          // (A, B, Y)
+  {"$_AOI3_", SKIP},         // (A, B, C, Y)
+  {"$_AOI4_", SKIP},         // (A, B, C, D, Y)
+  {"$_BUF_", SKIP},          // (A, Y)
+  {"$_DFFE_NN0N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NN0P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NN1N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NN1P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NN_", SKIP},      // (D, C, E, Q)
+  {"$_DFFE_NP0N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NP0P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NP1N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NP1P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_NP_", SKIP},      // (D, C, E, Q)
+  {"$_DFFE_PN0N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PN0P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PN1N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PN1P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PN_", SKIP},      // (D, C, E, Q)
+  {"$_DFFE_PP0N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PP0P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PP1N_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PP1P_", SKIP},    // (D, C, R, E, Q)
+  {"$_DFFE_PP_", SKIP},      // (D, C, E, Q)
+  {"$_DFFSRE_NNNN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NNNP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NNPN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NNPP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NPNN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NPNP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NPPN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_NPPP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PNNN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PNNP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PNPN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PNPP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PPNN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PPNP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PPPN_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSRE_PPPP_", SKIP},  // (C, S, R, E, D, Q)
+  {"$_DFFSR_NNN_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_NNP_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_NPN_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_NPP_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_PNN_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_PNP_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_PPN_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFFSR_PPP_", SKIP},    // (C, S, R, D, Q)
+  {"$_DFF_NN0_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_NN1_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_NP0_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_NP1_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_N_", SKIP},        // (D, C, Q)
+  {"$_DFF_PN0_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_PN1_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_PP0_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_PP1_", SKIP},      // (D, C, R, Q)
+  {"$_DFF_P_", SKIP},        // (D, C, Q)
+  {"$_DLATCHSR_NNN_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_NNP_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_NPN_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_NPP_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_PNN_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_PNP_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_PPN_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCHSR_PPP_", SKIP}, // (E, S, R, D, Q)
+  {"$_DLATCH_NN0_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_NN1_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_NP0_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_NP1_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_N_", SKIP},     // (E, D, Q)
+  {"$_DLATCH_PN0_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_PN1_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_PP0_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_PP1_", SKIP},   // (E, R, D, Q)
+  {"$_DLATCH_P_", SKIP},     // (E, D, Q)
+  {"$_FF_", SKIP},           // (D, Q)
+  {"$_MUX16_", SKIP},        // (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, S, T, U, V, Y)
+  {"$_MUX4_", SKIP},         // (A, B, C, D, S, T, Y)
+  {"$_MUX8_", SKIP},         // (A, B, C, D, E, F, G, H, S, T, U, Y)
+  {"$_MUX_", SKIP},          // (A, B, S, Y)
+  {"$_NAND_", SKIP},         // (A, B, Y)
+  {"$_NMUX_", SKIP},         // (A, B, S, Y)
+  {"$_NOR_", SKIP},          // (A, B, Y)
+  {"$_NOT_", SKIP},          // (A, Y)
+  {"$_OAI3_", SKIP},         // (A, B, C, Y)
+  {"$_OAI4_", SKIP},         // (A, B, C, D, Y)
+  {"$_ORNOT_", SKIP},        // (A, B, Y)
+  {"$_OR_", SKIP},           // (A, B, Y)
+  {"$_SDFFCE_NN0N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NN0P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NN1N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NN1P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NP0N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NP0P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NP1N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_NP1P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PN0N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PN0P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PN1N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PN1P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PP0N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PP0P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PP1N_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFCE_PP1P_", SKIP},  // (D, C, R, E, Q)
+  {"$_SDFFE_NN0N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NN0P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NN1N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NN1P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NP0N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NP0P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NP1N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_NP1P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PN0N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PN0P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PN1N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PN1P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PP0N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PP0P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PP1N_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFFE_PP1P_", SKIP},   // (D, C, R, E, Q)
+  {"$_SDFF_NN0_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_NN1_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_NP0_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_NP1_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_PN0_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_PN1_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_PP0_", SKIP},     // (D, C, R, Q)
+  {"$_SDFF_PP1_", SKIP},     // (D, C, R, Q)
+  {"$_SR_NN_", SKIP},        // (S, R, Q)
+  {"$_SR_NP_", SKIP},        // (S, R, Q)
+  {"$_SR_PN_", SKIP},        // (S, R, Q)
+  {"$_SR_PP_", SKIP},        // (S, R, Q)
+  {"$_TBUF_", SKIP},         // (A, E, Y)
+  {"$_XNOR_", SKIP},         // (A, B, Y)
+  {"$_XOR_", SKIP},          // (A, B, Y)
+  {"$add", ADD},             // (A, B, Y)
+  {"$adff", SKIP},           // (CLK, ARST, D, Q)
+  {"$adffe", SKIP},          // (CLK, ARST, EN, D, Q)
+  {"$adlatch", SKIP},        // (EN, ARST, D, Q)
+  {"$allconst", SKIP},       // (Y)
+  {"$allseq", SKIP},         // (Y)
+  {"$alu", SKIP},            // (A, B, CI, BI, X, Y, CO)
+  {"$and", SKIP},            // (A, B, Y)
+  {"$anyconst", SKIP},       // (Y)
+  {"$anyseq", SKIP},         // (Y)
+  {"$assert", SKIP},         // (A, EN)
+  {"$assume", SKIP},         // (A, EN)
+  {"$concat", SKIP},         // (A, B, Y)
+  {"$cover", SKIP},          // (A, EN)
+  {"$dff", SKIP},            // (CLK, D, Q)
+  {"$dffe", SKIP},           // (CLK, EN, D, Q)
+  {"$dffsr", SKIP},          // (CLK, SET, CLR, D, Q)
+  {"$dffsre", SKIP},         // (CLK, SET, CLR, EN, D, Q)
+  {"$div", SKIP},            // (A, B, Y)
+  {"$divfloor", SKIP},       // (A, B, Y)
+  {"$dlatch", SKIP},         // (EN, D, Q)
+  {"$dlatchsr", SKIP},       // (EN, SET, CLR, D, Q)
+  {"$eq", SKIP},             // (A, B, Y)
+  {"$equiv", SKIP},          // (A, B, Y)
+  {"$eqx", SKIP},            // (A, B, Y)
+  {"$fa", SKIP},             // (A, B, C, X, Y)
+  {"$fair", SKIP},           // (A, EN)
+  {"$ff", SKIP},             // (D, Q)
+  {"$fsm", SKIP},            // (CLK, ARST, CTRL_IN, CTRL_OUT)
+  {"$ge", SKIP},             // (A, B, Y)
+  {"$gt", SKIP},             // (A, B, Y)
+  {"$initstate", SKIP},      // (Y)
+  {"$lcu", SKIP},            // (P, G, CI, CO)
+  {"$le", SKIP},             // (A, B, Y)
+  {"$live", SKIP},           // (A, EN)
+  {"$logic_and", SKIP},      // (A, B, Y)
+  {"$logic_not", SKIP},      // (A, Y)
+  {"$logic_or", SKIP},       // (A, B, Y)
+  {"$lt", SKIP},             // (A, B, Y)
+  {"$lut", SKIP},            // (A, Y)
+  {"$mem", YMEM},
+  {"$mem_v2", YMEM2},
+  {"$macc", SKIP},        // (A, B, Y)
+  {"$meminit", SKIP},     // (ADDR, DATA)
+  {"$memrd", SKIP},       // (CLK, EN, ADDR, DATA)
+  {"$memwr", SKIP},       // (CLK, EN, ADDR, DATA)
+  {"$mod", SKIP},         // (A, B, Y)
+  {"$modfloor", SKIP},    // (A, B, Y)
+  {"$mul", MULTIPLY},     // (A, B, Y)
+  {"$mux", SKIP},         // (A, B, S, Y)
+  {"$ne", SKIP},          // (A, B, Y)
+  {"$neg", SKIP},         // (A, Y)
+  {"$nex", SKIP},         // (A, B, Y)
+  {"$not", SKIP},         // (A, Y)
+  {"$or", SKIP},          // (A, B, Y)
+  {"$pmux", SKIP},        // (A, B, S, Y)
+  {"$pos", SKIP},         // (A, Y)
+  {"$pow", SKIP},         // (A, B, Y)
+  {"$reduce_and", SKIP},  // (A, Y)
+  {"$reduce_bool", SKIP}, // (A, Y)
+  {"$reduce_or", SKIP},   // (A, Y)
+  {"$reduce_xnor", SKIP}, // (A, Y)
+  {"$reduce_xor", SKIP},  // (A, Y)
+  {"$sdff", SKIP},        // (CLK, SRST, D, Q)
+  {"$sdffce", SKIP},      // (CLK, SRST, EN, D, Q)
+  {"$sdffe", SKIP},       // (CLK, SRST, EN, D, Q)
+  {"$shift", SKIP},       // (A, B, Y)
+  {"$shiftx", SKIP},      // (A, B, Y)
+  {"$shl", SKIP},         // (A, B, Y)
+  {"$shr", SKIP},         // (A, B, Y)
+  {"$slice", SKIP},       // (A, Y)
+  {"$sop", SKIP},         // (A, Y)
+  {"$specify2", SKIP},    // (EN, SRC, DST)
+  {"$specify3", SKIP},    // (EN, SRC, DST, DAT)
+  {"$specrule", SKIP},    // (EN_SRC, EN_DST, SRC, DST)
+  {"$sr", SKIP},          // (SET, CLR, Q)
+  {"$sshl", SKIP},        // (A, B, Y)
+  {"$sshr", SKIP},        // (A, B, Y)
+  {"$sub", MINUS},        // (A, B, Y)
+  {"$tribuf", SKIP},      // (A, EN, Y)
+  {"$xnor", SKIP},        // (A, B, Y)
+
+  /*********** VTR Primitive modules START after "$xor" (still a Yosys cell) ***********/
+  {"$xor", SKIP},                            // (A, B, Y)
+  {"LUT_K", SKIP},                           // (in, out)
+  {"DFF", FF_NODE},                          // (clock, D, Q)
+  {"fpga_interconnect", operation_list_END}, // (datain, dataout)
+  {"mux", SMUX_2},                           // (select, x, y, z)
+  {"adder", ADD},                            // (a, b, cin, cout, sumout)
+  {"multiply", MULTIPLY},                    // (a, b, out)
+  {"single_port_ram", SPRAM},                // (clock, addr, data, we, out)
+  {"dual_port_ram", DPRAM}                   // (clock, addr1, addr2, data1, data2, we1, we2, out1, out2)
+});
diff --git a/parmys-plugin/utils/hash_table.cc b/parmys-plugin/utils/hash_table.cc
new file mode 100644
index 000000000..102999de8
--- /dev/null
+++ b/parmys-plugin/utils/hash_table.cc
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hash_table.h"
+#include "odin_types.h"
+#include "vtr_memory.h"
+
+void Hashtable::destroy_free_items()
+{
+    for (auto &entry : my_map) { // by reference: the slot is reset below and the key is not copied
+        vtr::free(entry.second);
+        entry.second = NULL; // a second call (or a later get) must not see a freed pointer
+    }
+}
+
+// Inserts key -> item. unordered_map::emplace leaves an existing entry for `key` untouched.
+void Hashtable::add(std::string key, void *item) { this->my_map.emplace(key, item); }
+
+// Removes `key` and returns its item to the caller; returns NULL when the key is absent.
+void *Hashtable::remove(std::string key)
+{
+    auto found = this->my_map.find(key);
+    if (found == this->my_map.end())
+        return NULL;
+    void *item = found->second;
+    this->my_map.erase(found);
+    return item;
+}
+
+// Returns the item stored under `key` without removing it, or NULL when the key is absent.
+void *Hashtable::get(std::string key)
+{
+    auto found = this->my_map.find(key);
+    return (found != this->my_map.end()) ? found->second : NULL;
+}
+
+bool Hashtable::is_empty() { return my_map.empty(); }
diff --git a/parmys-plugin/utils/hash_table.h b/parmys-plugin/utils/hash_table.h
new file mode 100644
index 000000000..323b8d8ef
--- /dev/null
+++ b/parmys-plugin/utils/hash_table.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _HASH_TABLE_H
+#define _HASH_TABLE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string>
+#include <unordered_map>
+
+class Hashtable
+{
+  private:
+    std::unordered_map<std::string, void *> my_map; // key -> opaque item pointer
+
+  public:
+    // Adds an item to the hashtable. The implementation uses emplace, so an entry already stored under `key` is kept.
+    void add(std::string key, void *item);
+    // Removes an item from the hashtable and returns it (it is not freed). If the item is not present, a null pointer is returned.
+    void *remove(std::string key);
+    // Gets an item from the hashtable without removing it. If the item is not present, a null pointer is returned.
+    void *get(std::string key);
+    // Check to see if the hashtable is empty.
+    bool is_empty();
+    // Calls vtr::free on each stored item; the keys themselves stay in the table.
+    void destroy_free_items();
+};
+
+#endif // _HASH_TABLE_H
diff --git a/parmys-plugin/utils/odin_error.cc b/parmys-plugin/utils/odin_error.cc
new file mode 100644
index 000000000..411a47c79
--- /dev/null
+++ b/parmys-plugin/utils/odin_error.cc
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_error.h"
+#include "config_t.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+std::vector<std::pair<std::string, int>> include_file_names;
+
+#define NUMBER_OF_LINES_DIGIT 5
+int delayed_errors = 0;
+const loc_t unknown_location = {-1, -1, -1};
+
+const char *odin_error_STR[] = {
+  "UTIL", "PARSE_ARGS", "AST", "BLIF ELABORATION", "NETLIST", "PARSE_BLIF", // one label per odin_error value, in enum order
+};
+
+// Raises one fatal error summarising the count when any delayed errors have been recorded.
+void verify_delayed_error(odin_error error_type)
+{
+    if (delayed_errors != 0)
+        error_message(error_type, unknown_location, "Parser found (%d) errors in your syntax, exiting", delayed_errors);
+}
+
+static std::string make_marker_from_str(std::string str, int column)
+{
+    std::string marker;
+    // a non-negative column caps how much of the line is mirrored; a negative one scans the whole line
+    const size_t limit = (column >= 0 && (size_t)column < str.size()) ? (size_t)column : str.size();
+
+    for (size_t pos = 0; pos < limit; ++pos) {
+        const char ch = str[pos];
+        if (ch == ' ' || ch == '\t') {
+            marker.push_back(ch); // spaces and tabs are copied through unchanged
+        } else if (column > 0) {
+            marker.push_back(' ');
+        } else {
+            break; // negative column: stop at the first non-blank character
+        }
+    }
+
+    return marker + "^~~~";
+}
+
+// Returns the text of 0-based line `line_number` of `file`, without its newline; "" if the file or the line is missing.
+static std::string get_culprit_line(int line_number, const char *file)
+{
+    std::string culprit_line = "";
+    FILE *input_file = fopen(file, "r");
+    if (input_file) {
+        // line 0 starts at the first byte, so copying has to be on before any '\n' has been seen
+        bool copy_characters = (line_number == 0);
+        int current_line_number = 0;
+
+        for (;;) {
+            int c = fgetc(input_file);
+            if (EOF == c) {
+                break;
+            } else if ('\n' == c) {
+                if (copy_characters) {
+                    break;
+                }
+                copy_characters = (line_number == ++current_line_number);
+            } else if (copy_characters) {
+                culprit_line.push_back(c);
+            }
+        }
+        fclose(input_file);
+    }
+    return culprit_line;
+}
+
+static void print_culprit_line_with_context(int column, int target_line, const char *file, int num_context_lines)
+{
+    const int last_line = target_line + num_context_lines;
+    for (int line = std::max(target_line - num_context_lines, 0); line <= last_line; line++) {
+        const std::string text = get_culprit_line(line, file);
+        int prefix_width; // filled in by %n below: number of characters written before the line text
+        fprintf(stderr, " %*d | %n%s\n", NUMBER_OF_LINES_DIGIT, line + 1, &prefix_width, text.c_str());
+        if (line == target_line)
+            fprintf(stderr, "%s\n", make_marker_from_str(text, prefix_width + column).c_str());
+    }
+}
+
+/* Write "<path>:<line>:<col>: warning|error[<category>]:  <message>" to stderr, then the source line with context
+ * when the location is known. A fatal message finishes by calling _verbose_abort(), which does not return. */
+void _log_message(odin_error error_type, loc_t loc, bool fatal_error, const char *function_file_name, int function_line, const char *function_name,
+                  const char *message, ...)
+{
+    fflush(stdout);
+    const bool known_file = loc.file >= 0 && (size_t)loc.file < include_file_names.size();
+    if (known_file) {
+        const char *name = include_file_names[loc.file].first.c_str();
+        // realpath() returns NULL when the path cannot be resolved; print the recorded name instead of NULL
+        char *path = realpath(name, NULL);
+        fprintf(stderr, "\n%s:%d:%d: ", path ? path : name, loc.line + 1, loc.col);
+        free(path);
+    }
+
+    fprintf(stderr, "%s", fatal_error ? "error" : "warning");
+    fprintf(stderr, "[%s]: ", odin_error_STR[error_type]);
+
+    if (message != NULL) {
+        va_list ap;
+        fprintf(stderr, " ");
+        va_start(ap, message);
+        vfprintf(stderr, message, ap);
+        va_end(ap);
+
+        // finish the line unless the format already did; an empty format must not be indexed at [-1]
+        const size_t length = strlen(message);
+        if (length == 0 || message[length - 1] != '\n')
+            fprintf(stderr, "\n");
+    }
+
+    if (known_file && loc.line >= 0) {
+        print_culprit_line_with_context(loc.col, loc.line, include_file_names[loc.file].first.c_str(), 2);
+    }
+
+    fflush(stderr);
+    if (fatal_error) {
+        _verbose_abort(NULL, function_file_name, function_line, function_name);
+    }
+}
+
+// Prints "<file>:<line>: <function>: " and the reason on stderr, then terminates via std::abort().
+void _verbose_abort(const char *condition_str, const char *odin_file_name, int odin_line_number, const char *odin_function_name)
+{
+    fflush(stdout);
+    fprintf(stderr, "\n%s:%d: %s: ", odin_file_name, odin_line_number, odin_function_name);
+    if (condition_str == NULL) {
+        // reached from a fatal error message, not an assertion: no source context is printed
+        fprintf(stderr, "Fatal error\n");
+    } else {
+        fprintf(stderr, "Assertion %s failed\n", condition_str);
+        // __LINE__ counts from 1, the context printer from 0
+        print_culprit_line_with_context(-1, odin_line_number - 1, odin_file_name, 2);
+    }
+    fflush(stderr);
+    std::abort();
+}
diff --git a/parmys-plugin/utils/odin_error.h b/parmys-plugin/utils/odin_error.h
new file mode 100644
index 000000000..6ad2681c2
--- /dev/null
+++ b/parmys-plugin/utils/odin_error.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _ODIN_ERROR_H_
+#define _ODIN_ERROR_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+struct loc_t {
+    int file = -1; // index into include_file_names; _log_message ignores the location when it is out of range
+    int line = -1; // 0-based line number (_log_message prints line + 1); negative suppresses the source excerpt
+    int col = -1;  // column, printed as-is in messages and used to place the marker under the source line
+};
+
+enum odin_error {
+    UTIL, // these values index odin_error_STR (odin_error.cc) in _log_message, so both lists must keep the same order
+    PARSE_ARGS,
+    AST,
+    RESOLVE,
+    NETLIST,
+    PARSE_BLIF,
+};
+
+extern const char *odin_error_STR[];
+extern std::vector<std::pair<std::string, int>> include_file_names;
+extern int delayed_errors;
+extern const loc_t unknown_location;
+
+// causes an interrupt in GDB
+[[noreturn]] void _verbose_abort(const char *condition_str, const char *odin_file_name, int odin_line_number, const char *odin_function_name);
+
+// Aborts with file/line/function and the failed expression when `condition` is false. Written as one
+// expression so that "if (x) oassert(y); else ..." cannot have its else captured by the macro.
+#define oassert(condition) (bool(condition) ? (void)0 : _verbose_abort(#condition, __FILE__, __LINE__, __func__))
+
+void _log_message(odin_error error_type, loc_t loc, bool fatal_error, const char *function_file_name, int function_line, const char *function_name,
+                  const char *message, ...); // fatal_error = true aborts after the message is printed
+
+#define error_message(error_type, loc, message, ...)                                                                                                 \
+    _log_message(error_type, loc, true, __FILE__, __LINE__, __PRETTY_FUNCTION__, message, __VA_ARGS__)
+// warning_message: like error_message (fatal = true above) but non-fatal. All four macros need >= 1 argument after `message`.
+#define warning_message(error_type, loc, message, ...)                                                                                               \
+    _log_message(error_type, loc, false, __FILE__, __LINE__, __PRETTY_FUNCTION__, message, __VA_ARGS__)
+// possible_error_message: fatal unless global_args.permissive is set; global_args must be visible where the macro is used.
+#define possible_error_message(error_type, loc, message, ...)                                                                                        \
+    _log_message(error_type, loc, !global_args.permissive.value(), __FILE__, __LINE__, __PRETTY_FUNCTION__, message, __VA_ARGS__)
+// delayed_error_message: non-fatal message that also bumps delayed_errors; expands to a { } block, not a single statement.
+#define delayed_error_message(error_type, loc, message, ...)                                                                                         \
+    {                                                                                                                                                \
+        _log_message(error_type, loc, false, __FILE__, __LINE__, __PRETTY_FUNCTION__, message, __VA_ARGS__);                                         \
+        delayed_errors += 1;                                                                                                                         \
+    }
+
+void verify_delayed_error(odin_error error_type);
+
+#endif // _ODIN_ERROR_H_
diff --git a/parmys-plugin/utils/odin_globals.h b/parmys-plugin/utils/odin_globals.h
new file mode 100644
index 000000000..fa00ecb05
--- /dev/null
+++ b/parmys-plugin/utils/odin_globals.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _ODIN_GLOBALS_H_
+#define _ODIN_GLOBALS_H_
+
+#include "config_t.h"
+#include "hard_soft_logic_mixer.h"
+#include "hash_table.h"
+#include "odin_types.h"
+#include "read_xml_arch_file.h"
+#include "string_cache.h"
+
+/**
+ * The cutoff for the number of netlist nodes.
+ * Odin-II prints statistics only for netlist
+ * nodes whose total count is greater than
+ * this value.
+ */
+constexpr long long UNUSED_NODE_TYPE = 0;
+
+extern global_args_t global_args;
+extern config_t configuration;
+extern loc_t my_location;
+
+extern nnode_t *gnd_node;
+extern nnode_t *vcc_node;
+extern nnode_t *pad_node;
+
+extern char *one_string;
+extern char *zero_string;
+extern char *pad_string;
+
+extern t_arch Arch;
+extern short physical_lut_size;
+
+/* logic optimization mixer, once ODIN is classy, could remove that
+ * and pass as member variable
+ */
+extern HardSoftLogicMixer *mixer;
+
+/**
+ * a global var to specify the need for cleanup after
+ * receiving a coarsen BLIF file as the input.
+ */
+extern bool coarsen_cleanup;
+
+extern strmap<file_type_e> file_type_strmap;
+extern strmap<operation_list> yosys_subckt_strmap;
+
+#endif // _ODIN_GLOBALS_H_
diff --git a/parmys-plugin/utils/odin_types.h b/parmys-plugin/utils/odin_types.h
new file mode 100644
index 000000000..5f25f9cdf
--- /dev/null
+++ b/parmys-plugin/utils/odin_types.h
@@ -0,0 +1,627 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _ODIN_TYPES_H_
+#define _ODIN_TYPES_H_
+
+#include "odin_error.h"
+#include "read_xml_arch_file.h"
+#include "string_cache.h"
+#include <atomic>
+#include <mutex>
+#include <stdbool.h>
+#include <string>
+#include <unordered_map>
+
+#include <stdlib.h>
+
+#include "rtl_int.hpp"
+
+#include "kernel/rtlil.h"
+
+/**
+ * to use short vs long string for output
+ */
+#define ODIN_LONG_STRING 0
+#define ODIN_SHORT_STRING 1
+
+#ifndef DEBUG_ODIN
+#define ODIN_STRING_TYPE ODIN_SHORT_STRING
+#else
+#define ODIN_STRING_TYPE ODIN_LONG_STRING
+#endif
+
+#define ODIN_STD_BITWIDTH (sizeof(long) * 8)
+
+/* buffer size for reading a directory path */
+#define READ_BUFFER_SIZE 1048576 // 1MB
+
+/* unique numbers to mark the nodes as we DFS traverse the netlist */
+#define PARTIAL_MAP_TRAVERSE_VALUE 10
+#define OUTPUT_TRAVERSE_VALUE 12
+#define COUNT_NODES 14 /* NOTE that you can't call countnodes one after the other or the mark will be incorrect */
+#define COMBO_LOOP 15
+#define COMBO_LOOP_ERROR 16
+#define GRAPH_CRUNCH 17
+#define STATS 18
+#define SEQUENTIAL_LEVELIZE 19
+#define RESOLVE_DFS_VALUE 30
+
+/* unique numbers for using void *data entries in some of the datastructures */
+#define RESET -1
+#define LEVELIZE 12
+#define ACTIVATION 13
+
+#define verify_i_o_availabilty(node, expected_input_size, expected_output_size)                                                                      \
+    passed_verify_i_o_availabilty(node, expected_input_size, expected_output_size, __FILE__, __LINE__)
+
+struct ast_node_t;
+struct nnode_t;
+struct npin_t;
+struct nnet_t;
+struct netlist_t;
+
+/* The global arguments of the software, stored as plain data members that are read directly. */
+struct global_args_t {
+    std::string program_name;
+    // Odin-II root directory
+    std::string program_root;
+    // Current path Odin-II is running in
+    std::string current_path;
+
+    std::string config_file;
+    std::vector<std::string> verilog_files;
+    std::string blif_file;
+    std::string output_file;
+    std::string arch_file;   // Name of the FPGA architecture file
+    std::string tcl_file;    // TCL file to be run by the Yosys elaborator
+    std::string elaborator;  // Name of the external elaborator tool, currently Yosys is supported, default is Odin
+    bool permissive;         // turn possible_errors into warnings
+    bool print_parse_tokens; // print the tokens as they are parsed by the parser
+
+    std::string high_level_block; // Legacy option, no longer used
+
+    std::string top_level_module_name; // force the name of the top level module desired
+
+    bool write_netlist_as_dot;
+    bool write_ast_as_dot;
+    bool all_warnings;
+    bool show_help;
+
+    // (disabled member) bool fflegalize; // makes flip-flops rising edge sensitive
+    bool coarsen; // tells Odin-II that the input blif is coarse-grain
+    // (disabled member) bool show_yosys_log; // show Yosys output logs on the standard output stream
+
+    std::string adder_def; // DEPRECATED
+
+    // defines if the first cin of an adder/subtractor is connected to a global gnd/vdd
+    // or generated using a dummy adder with both inputs set to gnd/vdd
+    bool adder_cin_global;
+
+    // Arguments for mixing hard and soft logic
+    int exact_mults;
+    float mults_ratio;
+};
+
+extern const char *ZERO_GND_ZERO;
+extern const char *ONE_VCC_CNS;
+extern const char *ZERO_PAD_ZERO;
+
+extern const char *SINGLE_PORT_RAM_string;
+extern const char *DUAL_PORT_RAM_string;
+extern const char *LUTRAM_string;
+
+extern const char *edge_type_e_STR[];
+extern const char *operation_list_STR[][2];
+
+template <typename T> using strmap = std::unordered_map<std::string, T>;
+
+enum file_type_e {
+    _ILANG, /* not supported yet */
+    _VERILOG,
+    _VERILOG_HEADER,
+    _BLIF,
+    _EBLIF,     /* not supported yet */
+    _UNDEFINED, /* ERROR */
+    file_type_e_END
+};
+
+enum elaborator_e { _ODIN, _YOSYS, elaborator_e_END };
+
+enum edge_type_e {
+    UNDEFINED_SENSITIVITY,
+    FALLING_EDGE_SENSITIVITY,
+    RISING_EDGE_SENSITIVITY,
+    ACTIVE_HIGH_SENSITIVITY,
+    ACTIVE_LOW_SENSITIVITY,
+    ASYNCHRONOUS_SENSITIVITY,
+    edge_type_e_END
+};
+
+enum circuit_type_e { COMBINATIONAL, SEQUENTIAL, circuit_type_e_END };
+
+enum init_value_e {
+    _0 = 0,
+    _1 = 1,
+    dont_care = 2,
+    undefined = 3,
+};
+
+/**
+ * List of the operation types supported for Odin-II nodes.
+ * In the synthesis flow, most operations are resolved or mapped
+ * to an operation mode that has an instantiation procedure in the
+ * partial mapping. However, for the techmap flow, nodes are elaborated
+ * into the operations supported by the partial mapper in BLIF elaboration.
+ * Technically, each Odin-II node should have one of the following
+ * operation types. To add support for a new type you would need to
+ * start from here to see how each operation mode is being resolved.
+ */
+enum operation_list {
+    NO_OP,
+    MULTI_PORT_MUX, // port 1 = control, port 2+ = mux options
+    FF_NODE,
+    BUF_NODE,
+    INPUT_NODE,
+    OUTPUT_NODE,
+    GND_NODE,
+    VCC_NODE,
+    CLOCK_NODE,
+    ADD,            // +
+    MINUS,          // -
+    BITWISE_NOT,    // ~
+    BITWISE_AND,    // &
+    BITWISE_OR,     // |
+    BITWISE_NAND,   // ~&
+    BITWISE_NOR,    // ~|
+    BITWISE_XNOR,   // ~^
+    BITWISE_XOR,    // ^
+    LOGICAL_NOT,    // !
+    LOGICAL_OR,     // ||
+    LOGICAL_AND,    // &&
+    LOGICAL_NAND,   // No Symbol
+    LOGICAL_NOR,    // No Symbol
+    LOGICAL_XNOR,   // No Symbol
+    LOGICAL_XOR,    // No Symbol
+    MULTIPLY,       // *
+    DIVIDE,         // /
+    MODULO,         // %
+    POWER,          // **
+    LT,             // <
+    GT,             // >
+    LOGICAL_EQUAL,  // ==
+    NOT_EQUAL,      // !=
+    LTE,            // <=
+    GTE,            // >=
+    SR,             // >>
+    ASR,            // >>>
+    SL,             // <<
+    ASL,            // <<<
+    CASE_EQUAL,     // ===
+    CASE_NOT_EQUAL, // !==
+    ADDER_FUNC,
+    CARRY_FUNC,
+    MUX_2,
+    SMUX_2, // MUX_2 with single bit selector (no need to add not selector as the second pin) => [SEL] [IN1, IN2] [OUT]
+    BLIF_FUNCTION,
+    NETLIST_FUNCTION,
+    MEMORY,
+    PAD_NODE,
+    HARD_IP,
+    GENERIC,             /* added for the unknown node type */
+    CLOG2,               // $clog2
+    UNSIGNED,            // $unsigned
+    SIGNED,              // $signed
+                         // [START] operations to cover yosys subckt
+    MULTI_BIT_MUX_2,     // like MUX_2 but with n-bit input/output
+    MULTIPORT_nBIT_SMUX, // n-bit input/output in multiple ports
+    PMUX,                // Multiplexer with many inputs using one-hot select signal
+    SDFF,                // data, S to reset value and output port
+    DFFE,                // data, enable to output port
+    SDFFE,               // data, synchronous reset value and enable to output port
+    SDFFCE,              // data, synchronous reset value and enable to reset value and output port
+    DFFSR,               // data, clear and set to output port
+    DFFSRE,              // data, clear and set with enable to output port
+    DLATCH,              // data to output port based on polarity without clk
+    ADLATCH,             // data to output port based on polarity without clk
+    SETCLR,              // set or clear input pins
+    SPRAM,               // representing primitive single port ram
+    DPRAM,               // representing primitive dual port ram
+    YMEM,                // $mem block memory generated by Yosys; can have a completely variable number of read and write ports, plus a clk for each
+    YMEM2,               // $mem_v2 block memory generated by Yosys; can have a completely variable number of read and write ports, plus a clk for each
+    BRAM,                // Odin-II block memory, from techlib/bram_bb.v
+    ROM,                 // Odin-II read-only memory, from techlib/rom_bb.v
+                         // [END] operations to cover yosys subckt
+    SKIP,                // to skip mapping for this specific node
+    operation_list_END
+};
+
+enum ids {
+    NO_ID,
+    /* top level things */
+    FILE_ITEMS,
+    MODULE,
+    SPECIFY,
+    /* VARIABLES */
+    INPUT,
+    OUTPUT,
+    INOUT,
+    WIRE,
+    REG,
+    GENVAR,
+    PARAMETER,
+    LOCALPARAM,
+    INITIAL,
+    PORT,
+    /* OTHER MODULE ITEMS */
+    MODULE_ITEMS,
+    VAR_DECLARE,
+    VAR_DECLARE_LIST,
+    ASSIGN,
+    /* OTHER MODULE AND FUNCTION ITEMS */
+    FUNCTION,
+    /* OTHER FUNCTION ITEMS */
+    FUNCTION_ITEMS,
+    TASK,
+    TASK_ITEMS,
+    /* primitives */
+    GATE,
+    GATE_INSTANCE,
+    ONE_GATE_INSTANCE,
+    /* Module instances */
+    MODULE_CONNECT_LIST,
+    MODULE_CONNECT,
+    MODULE_PARAMETER_LIST,
+    MODULE_PARAMETER,
+    MODULE_NAMED_INSTANCE,
+    MODULE_INSTANCE,
+    MODULE_MASTER_INSTANCE,
+    ONE_MODULE_INSTANCE,
+    /* Function instances*/
+    FUNCTION_NAMED_INSTANCE,
+    FUNCTION_INSTANCE,
+    TASK_NAMED_INSTANCE,
+    TASK_INSTANCE,
+    /* Specify Items */
+    SPECIFY_ITEMS,
+    SPECIFY_PARAMETER,
+    SPECIFY_PAL_CONNECTION_STATEMENT,
+    SPECIFY_PAL_CONNECT_LIST,
+    /* statements */
+    STATEMENT,
+    BLOCK,
+    NON_BLOCKING_STATEMENT,
+    BLOCKING_STATEMENT,
+    ASSIGNING_LIST,
+    CASE,
+    CASE_LIST,
+    CASE_ITEM,
+    CASE_DEFAULT,
+    ALWAYS,
+    IF,
+    FOR,
+    WHILE,
+    /* Delay Control */
+    DELAY_CONTROL,
+    POSEDGE,
+    NEGEDGE,
+    /* expressions */
+    TERNARY_OPERATION,
+    BINARY_OPERATION,
+    UNARY_OPERATION,
+    /* basic primitives */
+    ARRAY_REF,
+    RANGE_REF,
+    CONCATENATE,
+    REPLICATE,
+    /* basic identifiers */
+    IDENTIFIERS,
+    NUMBERS,
+    /* C functions */
+    C_ARG_LIST,
+    DISPLAY,
+    FINISH,
+    /* Hard Blocks */
+    HARD_BLOCK,
+    HARD_BLOCK_NAMED_INSTANCE,
+    HARD_BLOCK_CONNECT_LIST,
+    HARD_BLOCK_CONNECT,
+    // EDDIE: new enum value for ids to replace MEMORY from operation_t
+    RAM,
+    ids_END
+};
+
+struct metric_t {
+    double min_depth;
+    double max_depth;
+    double avg_depth;
+    double avg_width;
+};
+
+struct stat_t {
+    metric_t upward;
+    metric_t downward;
+};
+
+struct typ {
+    char *identifier;
+    VNumber *vnumber = nullptr;
+    struct {
+        operation_list op;
+    } operation;
+    struct {
+        short is_parameter;
+        short is_string;
+        short is_localparam;
+        short is_defparam;
+        short is_port;
+        short is_input;
+        short is_output;
+        short is_inout;
+        short is_wire;
+        short is_reg;
+        short is_genvar;
+        short is_memory;
+        operation_list signedness;
+        VNumber *initial_value = nullptr;
+    } variable;
+    struct {
+        short is_instantiated;
+        ast_node_t **module_instantiations_instance;
+        int size_module_instantiations;
+    } module;
+    struct {
+        short is_instantiated;
+        ast_node_t **function_instantiations_instance;
+        int size_function_instantiations;
+    } function;
+    struct {
+        short is_instantiated;
+        ast_node_t **task_instantiations_instance;
+        int size_task_instantiations;
+    } task;
+    struct {
+        int num_bit_strings;
+        char **bit_strings;
+    } concat;
+};
+
+struct ast_node_t {
+    loc_t loc;
+
+    long unique_count;
+    int far_tag;
+    int high_number;
+    ids type;
+    typ types;
+
+    ast_node_t *identifier_node;
+    ast_node_t **children;
+    long num_children;
+
+    void *hb_port;
+    void *net_node;
+    long chunk_size;
+};
+
+//-----------------------------------------------------------------------------------------------------
+
+/* DEFINITIONS for carry chain */
+struct chain_information_t {
+    char *name; // unique name of the chain
+    int count;  // the number of hard blocks in this chain
+    int num_bits;
+};
+
+//-----------------------------------------------------------------------------------------------------
+
+/**
+ * DEFINITIONS of netlist node attributes.
+ * The attr_t structure provides the sensitivity of the control signals.
+ * In the synthesis flow, the attribute structure is mostly used to
+ * specify the clock sensitivity. However, in the techmap flow,
+ * it is used in most sections, including DFFs, block memories
+ * and arithmetic operation instantiation.
+ */
+struct attr_t {
+    edge_type_e clk_edge_type;   // clock edge sensitivity
+    edge_type_e clr_polarity;    // clear (reset to GND) polarity
+    edge_type_e set_polarity;    // set to VCC polarity
+    edge_type_e enable_polarity; // enable polarity
+    edge_type_e areset_polarity; // asynchronous reset polarity
+    edge_type_e sreset_polarity; // synchronous reset polarity
+
+    long areset_value; // asynchronous reset value
+    long sreset_value; // synchronous reset value
+
+    operation_list port_a_signed;
+    operation_list port_b_signed;
+
+    /* memory node attributes */
+    long size;                   // memory size
+    long offset;                 // ADDR offset
+    char *memory_id;             // the id of memory in verilog file (different from name since for memory it is $mem~#)
+    edge_type_e RD_CLK_ENABLE;   // read clock enable
+    edge_type_e WR_CLK_ENABLE;   // write clock enable
+    edge_type_e RD_CLK_POLARITY; // read clock polarity
+    edge_type_e WR_CLK_POLARITY; // write clock polarity
+    long RD_PORTS;               // Numof read ports
+    long WR_PORTS;               // Num of Write ports
+    long DBITS;                  // Data width
+    long ABITS;                  // Addr width
+};
+
+/* DEFINITIONS for all the different types of nodes there are.  This is also cross-referenced in utils.c so that I can get a string version
+ * of these names, so if you add new types in here, be sure to add those same types in utils.c */
+struct nnode_t {
+    Yosys::RTLIL::Cell *cell;
+    Yosys::hashlib::dict<Yosys::RTLIL::IdString, Yosys::RTLIL::Const> cell_parameters;
+
+    loc_t loc;
+
+    long unique_id;
+    char *name;          // unique name of a node
+    operation_list type; // the type of node
+    int bit_width;       // Size of the operation (e.g. for adders/subtractors)
+
+    ast_node_t *related_ast_node; // the abstract syntax node that made this node
+
+    uintptr_t traverse_visited; // a way to mark if we've visited yet
+    stat_t stat;
+
+    npin_t **input_pins; // the input pins
+    long num_input_pins;
+    int *input_port_sizes; // info about the input ports
+    int num_input_port_sizes;
+
+    npin_t **output_pins; // the output pins
+    long num_output_pins;
+    int *output_port_sizes; // info if there is ports
+    int num_output_port_sizes;
+
+    short unique_node_data_id;
+    void *node_data; // this is a point where you can add additional data for your optimization or technique
+
+    int forward_level;           // this is your logic level relative to PIs and FFs .. i.e farthest PI
+    int backward_level;          // this is your reverse logic level relative to POs and FFs .. i.e. farthest PO
+    int sequential_level;        // the associated sequential network that the node is in
+    short sequential_terminator; // if this combinational node is a terminator for the sequential level (connects to flip-flop or Output pin
+
+    std::vector<std::vector<BitSpace::bit_value_t>> memory_data;
+
+    // For simulation
+    //    int in_queue;           // Flag used by the simulator to avoid double queueing.
+    //    npin_t** undriven_pins; // These pins have been found by the simulator to have no driver.
+    //    int num_undriven_pins;
+    //    int ratio;                  //clock ratio for clock nodes
+    init_value_e initial_value; // initial net value
+                                //    bool internal_clk_warn = false;
+
+    attr_t *attributes;
+
+    //    bool covered = false;
+
+    // For mixing soft and hard logic optimizations
+    // a field that is used for storing weights towards the
+    // mixing optimization.
+    //  value of -1 is reserved for hardened blocks
+    long weight = 0;
+};
+
+struct npin_t {
+    long unique_id;
+    ids type; // INPUT or OUTPUT
+    char *name;
+    nnet_t *net; // related net
+    int pin_net_idx;
+    nnode_t *node;    // related node
+    int pin_node_idx; // pin on the node where we're located
+    char *mapping;    // name of mapped port from hard block
+
+    edge_type_e sensitivity;
+
+    ////////////////////
+    // For simulation
+
+    bool delay_cycle;
+
+    unsigned long coverage;
+    bool is_default; // The pin is feeding a mux from logic representing an else or default.
+    bool is_implied; // This signal is implied.
+};
+
+struct nnet_t {
+    long unique_id;
+    char *name; // name for the net
+    short combined;
+
+    int num_driver_pins;
+    npin_t **driver_pins; // the pin that drives the net
+
+    npin_t **fanout_pins; // the pins pointed to by the net
+    int num_fanout_pins;  // the list size of pins
+
+    short unique_net_data_id;
+    void *net_data;
+
+    uintptr_t traverse_visited;
+    stat_t stat;
+    /////////////////////
+    // For simulation
+    //////////////////////
+};
+
+struct signal_list_t {
+    npin_t **pins;
+    long count;
+
+    char is_memory;
+    char is_adder;
+};
+
+struct netlist_t {
+    char *identifier;
+
+    nnode_t *gnd_node;
+    nnode_t *vcc_node;
+    nnode_t *pad_node;
+    nnet_t *zero_net;
+    nnet_t *one_net;
+    nnet_t *pad_net;
+    nnode_t **top_input_nodes;
+    int num_top_input_nodes;
+    nnode_t **top_output_nodes;
+    int num_top_output_nodes;
+    nnode_t **ff_nodes;
+    int num_ff_nodes;
+    nnode_t **internal_nodes;
+    int num_internal_nodes;
+    nnode_t **clocks;
+    int num_clocks;
+
+    /* netlist levelized structures */
+    nnode_t ***forward_levels;
+    int num_forward_levels;
+    int *num_at_forward_level;
+    nnode_t **
+      *backward_levels; // NOTE backward levels isn't neccessarily perfect.  Because of multiple output pins, the node can be put closer to POs than
+                        // should be.  To fix, run a rebuild of the list afterwards since the marked "node->backward_level" is correct */
+    int num_backward_levels;
+    int *num_at_backward_level;
+
+    nnode_t ***sequential_level_nodes;
+    int num_sequential_levels;
+    int *num_at_sequential_level;
+    /* these structures store the last combinational node in a level before a flip-flop or output pin */
+    nnode_t ***sequential_level_combinational_termination_node;
+    int num_sequential_level_combinational_termination_nodes;
+    int *num_at_sequential_level_combinational_termination_node;
+
+    STRING_CACHE *nets_sc;
+    STRING_CACHE *out_pins_sc;
+    STRING_CACHE *nodes_sc;
+
+    long long num_of_type[operation_list_END];
+    long long num_of_node;
+    long long num_logic_element;
+    metric_t output_node_stat;
+
+    t_logical_block_type_ptr type;
+    Yosys::Design *design;
+};
+
+#endif // _ODIN_TYPES_H_
diff --git a/parmys-plugin/utils/odin_util.cc b/parmys-plugin/utils/odin_util.cc
new file mode 100644
index 000000000..88d093e3c
--- /dev/null
+++ b/parmys-plugin/utils/odin_util.cc
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "odin_globals.h"
+#include "odin_types.h"
+#include <cstdarg>
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+
+#include "odin_util.h"
+#include "vtr_memory.h"
+#include "vtr_path.h"
+#include "vtr_util.h"
+#include <regex>
+#include <stdbool.h>
+
+// for mkdir
+#ifdef WIN32
+#include <direct.h>
+#define getcwd _getcwd
+#else
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+long shift_left_value_with_overflow_check(long input_value, long shift_by, loc_t loc)
+{
+    const long max_shift = (long)ODIN_STD_BITWIDTH - 1; // smallest shift amount that triggers the overflow warning below
+    if (shift_by < 0)
+        error_message(NETLIST, loc, "requesting a shift left that is negative [%ld]\n", shift_by);
+    else if (shift_by >= max_shift) // the format expects the limit first, then the requested amount in brackets
+        warning_message(NETLIST, loc, "requesting a shift left that will overflow the maximum size of %ld [%ld]\n", max_shift, shift_by);
+    return input_value << shift_by; // out-of-range amounts are only reported above; the shift is still performed
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: name_based_on_op)
+ * 	Look up the printable name of an operation in operation_list_STR (column ODIN_STRING_TYPE)
+ *-------------------------------------------------------------------------------------------*/
+const char *name_based_on_op(operation_list op)
+{
+    oassert(op < operation_list_END && "OUT OF BOUND operation_list!");
+    const char *const *names_of_op = operation_list_STR[op];
+    return names_of_op[ODIN_STRING_TYPE];
+}
+
+/*---------------------------------------------------------------------------------------------
+ * (function: node_name_based_on_op)
+ * 	Get the string version of a node's operation type (node->type); node is dereferenced unchecked
+ *-------------------------------------------------------------------------------------------*/
+const char *node_name_based_on_op(nnode_t *node) { return name_based_on_op(node->type); }
+
+/*---------------------------------------------------------------------------------------------
+ * (function: make_full_ref_name)
+ * Builds one of the following forms as a newly allocated string (vtr::strdup):
+ * // {previous_string}.instance_name
+ * // {previous_string}.instance_name^signal_name
+ * // {previous_string}.instance_name^signal_name~bit   (bit == -1 means "no bit suffix")
+ *-------------------------------------------------------------------------------------------*/
+char *make_full_ref_name(const char *previous, const char * /*module_name*/, const char *module_instance_name, const char *signal_name, long bit)
+{
+    // the '^' separator is only emitted when something precedes the signal name
+    const bool has_prefix = previous || module_instance_name;
+    std::stringstream full_name;
+
+    if (previous)
+        full_name << previous;
+    if (module_instance_name)
+        full_name << "." << module_instance_name;
+    if (signal_name) {
+        if (has_prefix)
+            full_name << "^";
+        full_name << signal_name;
+    }
+    if (bit != -1) {
+        oassert(signal_name);
+        full_name << "~" << std::dec << bit;
+    }
+    return vtr::strdup(full_name.str().c_str());
+}
+
+/*---------------------------------------------------------------------------------------------
+ *  (function: make_simple_name )
+ *  Return a copy of input in which every character listed in flatten_string
+ *  has been replaced by flatten_char
+ *-------------------------------------------------------------------------------------------*/
+std::string make_simple_name(char *input, const char *flatten_string, char flatten_char)
+{
+    oassert(input);
+    oassert(flatten_string);
+
+    std::string simplified(input);
+    for (const char *unwanted = flatten_string; *unwanted != '\0'; ++unwanted)
+        std::replace(simplified.begin(), simplified.end(), *unwanted, flatten_char);
+
+    return simplified;
+}
+
+/*-----------------------------------------------------------------------
+ * (function: my_malloc_struct )
+ * 	Allocate bytes_to_alloc bytes through vtr::calloc and stamp the leading long with a running id
+ *-----------------------------------------------------------------*/
+void *my_malloc_struct(long bytes_to_alloc)
+{
+    static long int m_id = 0; // running counter shared by every call
+    // ways to stop the execution at the point when a specific structure is built...note it needs to be m_id - 1 ... it's unique_id in most data
+    // structures: oassert(m_id != 193);
+
+    void *allocated = vtr::calloc(1, bytes_to_alloc);
+    if (!allocated) {
+        fprintf(stderr, "MEMORY FAILURE\n");
+        oassert(0);
+    }
+
+    /* mark the unique_id: the id is written over the first long of the block */
+    long int *id_slot = (long int *)allocated;
+    *id_slot = m_id;
+    m_id += 1;
+    return allocated;
+}
+
+/*
+ * Builds "<string><formatted appendage>" as a newly allocated C string
+ * (vtr::strdup) and leaves `string` untouched.
+ *
+ * `appendage` is a printf-style format consumed together with the variadic
+ * arguments; the formatted part is written into a vtr::bufsize-byte buffer.
+ */
+char *append_string(const char *string, const char *appendage, ...)
+{
+    char formatted[vtr::bufsize];
+
+    va_list format_args;
+    va_start(format_args, appendage);
+    vsnprintf(formatted, sizeof(formatted), appendage, format_args);
+    va_end(format_args);
+
+    std::string combined(string);
+    combined += formatted;
+    return vtr::strdup(combined.c_str());
+}
+
+/* Checks node's input/output pin counts against the expected sizes (-1 disables a check); mismatches are reported via error_message. */
+void passed_verify_i_o_availabilty(nnode_t *node, int expected_input_size, int expected_output_size, const char *current_src, int line_src)
+{
+    if (!node) {
+        error_message(UTIL, unknown_location, "node unavailable @%s::%d", current_src, line_src);
+        return; // nothing left to inspect; never dereference a null node even if error_message returns
+    }
+    std::stringstream err_message;
+    int error = 0;
+
+    if (expected_input_size != -1 && node->num_input_pins != expected_input_size) {
+        err_message << " input size is " << std::to_string(node->num_input_pins) << " expected " << expected_input_size << ":\n";
+        for (long i = 0; i < node->num_input_pins; i++) // list every pin, not only the first one
+            err_message << "\t" << node->input_pins[i]->name << "\n";
+        error = 1;
+    }
+
+    if (expected_output_size != -1 && node->num_output_pins != expected_output_size) {
+        err_message << " output size is " << std::to_string(node->num_output_pins) << " expected " << expected_output_size << ":\n";
+        for (long i = 0; i < node->num_output_pins; i++) // list every pin, not only the first one
+            err_message << "\t" << node->output_pins[i]->name << "\n";
+        error = 1;
+    }
+
+    if (error)
+        error_message(UTIL, node->loc, "failed for %s:%s %s\n", node_name_based_on_op(node), node->name, err_message.str().c_str());
+}
+
+/*
+ * Seconds elapsed since the std::chrono::system_clock epoch, as a double.
+ */
+double wall_time()
+{
+    using seconds_as_double = std::chrono::duration<double>;
+
+    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
+    return std::chrono::duration_cast<seconds_as_double>(since_epoch).count();
+}
+
+/**
+ * Replacement for sprintf: the output is formatted into a temporary buffer first and
+ * only then copied into s, so s may also appear among the arguments being formatted.
+ * Returns the number of bytes stored in s including the terminating NUL (one more
+ * than sprintf would report), or -1 when formatting or the temporary allocation fails.
+ */
+int odin_sprintf(char *s, const char *format, ...)
+{
+    va_list args, args_copy;
+    va_start(args, format);
+    va_copy(args_copy, args);
+
+    // first pass only measures the output; a negative result means the format could not be processed
+    const int formatted_len = std::vsnprintf(nullptr, 0, format, args);
+    va_end(args);
+    int written = -1;
+    if (formatted_len >= 0) {
+        try {
+            std::string temp(static_cast<size_t>(formatted_len) + 1, ' ');
+            std::vsnprintf(&temp.front(), temp.length(), format, args_copy);
+            strncpy(s, temp.c_str(), temp.length()); // the last byte of temp is the NUL stored by vsnprintf
+            written = static_cast<int>(temp.length());
+        } catch (const std::bad_alloc &) {
+            written = -1;
+        }
+    }
+    va_end(args_copy);
+    return written;
+}
diff --git a/parmys-plugin/utils/odin_util.h b/parmys-plugin/utils/odin_util.h
new file mode 100644
index 000000000..05583595f
--- /dev/null
+++ b/parmys-plugin/utils/odin_util.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _ODIN_UTIL_H_
+#define _ODIN_UTIL_H_
+
+#include <string>
+
+#include "odin_types.h"
+
+long shift_left_value_with_overflow_check(long input_value, long shift_by, loc_t loc);
+
+const char *name_based_on_op(operation_list op);
+const char *node_name_based_on_op(nnode_t *node);
+
+char *make_full_ref_name(const char *previous, const char *module_name, const char *module_instance_name, const char *signal_name, long bit);
+
+std::string make_simple_name(char *input, const char *flatten_string, char flatten_char);
+
+void *my_malloc_struct(long bytes_to_alloc);
+
+char *append_string(const char *string, const char *appendage, ...);
+
+double wall_time();
+
+int odin_sprintf(char *s, const char *format, ...);
+
+void passed_verify_i_o_availabilty(nnode_t *node, int expected_input_size, int expected_output_size, const char *current_src, int line_src);
+
+#endif // _ODIN_UTIL_H_
diff --git a/parmys-plugin/utils/read_xml_config_file.cc b/parmys-plugin/utils/read_xml_config_file.cc
new file mode 100644
index 000000000..7bc81b1a7
--- /dev/null
+++ b/parmys-plugin/utils/read_xml_config_file.cc
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "read_xml_config_file.h"
+#include "odin_globals.h"
+#include "odin_types.h"
+#include "pugixml.hpp"
+#include "read_xml_util.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace pugiutil;
+
+USING_YOSYS_NAMESPACE
+
+config_t configuration;
+
+void read_inputs(pugi::xml_node a_node, config_t *config, const pugiutil::loc_data &loc_data);
+void read_outputs(pugi::xml_node a_node, const pugiutil::loc_data &loc_data);
+void read_debug_switches(pugi::xml_node a_node, config_t *config, const pugiutil::loc_data &loc_data);
+void read_optimizations(pugi::xml_node a_node, config_t *config, const pugiutil::loc_data &loc_data);
+void set_default_optimization_settings(config_t *config);
+
+extern HardSoftLogicMixer *mixer;
+
+/*-------------------------------------------------------------------------
+ * (function: read_config_file)
+ * Loads the XML configuration file `file_name` into the global
+ * `configuration` object (read_outputs additionally fills `global_args`).
+ *
+ * The children of the <config> root are handled in a fixed order: <inputs>,
+ * <output>, <optimizations> and <debug_outputs>. Only <optimizations> is
+ * looked up as OPTIONAL; its defaults are applied before it is searched for.
+ *
+ * An XmlError thrown while loading or walking the document is logged and
+ * ends the read; values that were stored before the error are kept.
+ *-----------------------------------------------------------------------*/
+void read_config_file(const char *file_name)
+{
+    /* A file name is mandatory */
+    oassert(file_name != NULL);
+
+    pugi::xml_document xml_doc;
+    pugiutil::loc_data loc_data;
+    try {
+        loc_data = pugiutil::load_xml(xml_doc, file_name);
+
+        /* The root element has to be <config> */
+        pugi::xml_node root = get_single_child(xml_doc, "config", loc_data);
+
+        /* <inputs>: the list of input files */
+        pugi::xml_node inputs = get_single_child(root, "inputs", loc_data);
+        read_inputs(inputs, &configuration, loc_data);
+
+        /* <output>: output file and target architecture */
+        pugi::xml_node output = get_single_child(root, "output", loc_data);
+        read_outputs(output, loc_data);
+
+        /* <optimizations>: start from the defaults, then apply the overrides */
+        set_default_optimization_settings(&configuration);
+        pugi::xml_node optimizations = get_single_child(root, "optimizations", loc_data, OPTIONAL);
+        if (optimizations) {
+            read_optimizations(optimizations, &configuration, loc_data);
+        }
+
+        /* <debug_outputs>: debugging switches */
+        pugi::xml_node debug_outputs = get_single_child(root, "debug_outputs", loc_data);
+        read_debug_switches(debug_outputs, &configuration, loc_data);
+    } catch (XmlError &e) {
+        /* Report the problem and give up on the rest of the file */
+        log("error: could not parse xml configuration file '%s': %s\n", file_name, e.what());
+    }
+}
+
+/*-------------------------------------------------------------------------
+ * (function: read_inputs)
+ * Appends the text of every <input_path_and_name> child of `a_node` to
+ * config->list_of_file_names, in sibling order.
+ *
+ * <input_type> is still looked up so that whatever checks get_single_child
+ * performs keep running, but its value is ignored: the old expression
+ * `file_type_strmap[...]` threw its result away and, if that object is a
+ * std::map-like container, inserted the key as a side effect.
+ *-----------------------------------------------------------------------*/
+void read_inputs(pugi::xml_node a_node, config_t *config, const pugiutil::loc_data &loc_data)
+{
+    (void)get_single_child(a_node, "input_type", loc_data, OPTIONAL);
+
+    pugi::xml_node child = get_first_child(a_node, "input_path_and_name", loc_data, OPTIONAL);
+    while (child) {
+        config->list_of_file_names.push_back(child.child_value());
+        child = child.next_sibling(child.name());
+    }
+}
+
+/*--------------------------------------------------------------------------
+ * (function: read_outputs)
+ * Reads the <output> node `a_node` into the global `global_args`:
+ *   <output_path_and_name> -> global_args.output_file
+ *   <target>/<arch_file>   -> global_args.arch_file (after error_message if one is already set)
+ *
+ * <output_type> is still looked up so that whatever checks get_single_child
+ * performs keep running, but its value is ignored: the old expression
+ * `file_type_strmap[...]` threw its result away and, if that object is a
+ * std::map-like container, inserted the key as a side effect.
+ *------------------------------------------------------------------------*/
+void read_outputs(pugi::xml_node a_node, const pugiutil::loc_data &loc_data)
+{
+    (void)get_single_child(a_node, "output_type", loc_data, OPTIONAL);
+
+    pugi::xml_node out_file = get_single_child(a_node, "output_path_and_name", loc_data, OPTIONAL);
+    if (out_file) {
+        global_args.output_file = out_file.child_value();
+    }
+
+    pugi::xml_node target = get_single_child(a_node, "target", loc_data, OPTIONAL);
+    pugi::xml_node arch = target ? get_single_child(target, "arch_file", loc_data, OPTIONAL) : pugi::xml_node();
+    if (arch) {
+        /* Two arch files specified? */
+        if (global_args.arch_file != "") {
+            error_message(PARSE_ARGS, unknown_location, "%s", "Error: Arch file specified in config file AND command line\n");
+        }
+        global_args.arch_file = arch.child_value();
+    }
+}
+
+/*--------------------------------------------------------------------------
+ * (function: read_debug_switches)
+ * Reads the debug switches found under `a_node`:
+ *   <output_netlist_graphs> -> config->output_netlist_graphs (via atoi)
+ *   <debug_output_path>     -> config->debug_output_path
+ * <output_ast_graphs> and <print_parse_tokens> are looked up as well, but
+ * their values are not stored anywhere.
+ *------------------------------------------------------------------------*/
+void read_debug_switches(pugi::xml_node a_node, config_t *config, const pugiutil::loc_data &loc_data)
+{
+    /* Looked up only; the result is unused */
+    (void)get_single_child(a_node, "output_ast_graphs", loc_data, OPTIONAL);
+
+    pugi::xml_node netlist_graphs = get_single_child(a_node, "output_netlist_graphs", loc_data, OPTIONAL);
+    if (netlist_graphs)
+        config->output_netlist_graphs = atoi(netlist_graphs.child_value());
+
+    pugi::xml_node debug_path = get_single_child(a_node, "debug_output_path", loc_data, OPTIONAL);
+    if (debug_path)
+        config->debug_output_path = debug_path.child_value();
+
+    (void)get_single_child(a_node, "print_parse_tokens", loc_data, OPTIONAL);
+}
+
+/*--------------------------------------------------------------------------
+ * (function: set_default_optimization_settings)
+ * Stores the built-in default of the eight optimization fields of `config`.
+ *------------------------------------------------------------------------*/
+void set_default_optimization_settings(config_t *config)
+{
+    config->split_hard_multiplier = 1; /* multipliers */
+    config->mult_padding = -1; /* unconn */
+    config->fixed_hard_multiplier = 0;
+    config->min_hard_multiplier = 0;
+    config->min_threshold_adder = 0; /* adders */
+    config->fixed_hard_adder = 0;
+    config->split_memory_depth = false; /* memories */
+    config->split_memory_width = false;
+}
+
+/*--------------------------------------------------------------------------
+ * (function: read_optimizations)
+ * Applies the children of the <optimizations> node `a_node`; every child and
+ * every attribute is looked up as OPTIONAL.
+ *   <multiply size= padding= fixed= fracture=>
+ *       -> config->min_hard_multiplier, mult_padding, fixed_hard_multiplier,
+ *          split_hard_multiplier
+ *   <mix_soft_hard_blocks mults_ratio= exact_mults=>
+ *       -> replaces mixer->_opts[MULTIPLY] with a new MultsOpt
+ *   <memory split_memory_width= split_memory_depth=>
+ *       -> config->split_memory_width, split_memory_depth
+ *   <adder threshold_size= fixed=>
+ *       -> config->min_threshold_adder, fixed_hard_adder
+ *
+ * When a child element is present, each of its attributes that is missing
+ * sets the matching field to the fallback noted next to it below.
+ *------------------------------------------------------------------------*/
+void read_optimizations(pugi::xml_node a_node, config_t *config, const pugiutil::loc_data &loc_data)
+{
+    const char *value;
+
+    pugi::xml_node multiply = get_single_child(a_node, "multiply", loc_data, OPTIONAL);
+    if (multiply) {
+        /* Fallback 0: no minimum hard multiply size */
+        value = get_attribute(multiply, "size", loc_data, OPTIONAL).as_string(NULL);
+        config->min_hard_multiplier = (value != NULL) ? atoi(value) : 0;
+
+        /* Fallback -1: pad to hbpad pins */
+        value = get_attribute(multiply, "padding", loc_data, OPTIONAL).as_string(NULL);
+        config->mult_padding = (value != NULL) ? atoi(value) : -1;
+
+        /* Fallback 0: no fixed hard multiply size */
+        value = get_attribute(multiply, "fixed", loc_data, OPTIONAL).as_string(NULL);
+        config->fixed_hard_multiplier = (value != NULL) ? atoi(value) : 0;
+
+        /* Fallback 1: use fractured hard multiply size */
+        value = get_attribute(multiply, "fracture", loc_data, OPTIONAL).as_string(NULL);
+        config->split_hard_multiplier = (value != NULL) ? atoi(value) : 1;
+    }
+
+    pugi::xml_node mix = get_single_child(a_node, "mix_soft_hard_blocks", loc_data, OPTIONAL);
+    if (mix) {
+        value = get_attribute(mix, "mults_ratio", loc_data, OPTIONAL).as_string(NULL);
+        if (value != NULL) {
+            /* Ratios outside [0, 1] are ignored */
+            float ratio = atof(value);
+            if (ratio >= 0.0 && ratio <= 1.0) {
+                delete mixer->_opts[MULTIPLY];
+                mixer->_opts[MULTIPLY] = new MultsOpt(ratio);
+            }
+        }
+
+        /* Handled after mults_ratio, so an accepted exact_mults replaces it */
+        value = get_attribute(mix, "exact_mults", loc_data, OPTIONAL).as_string(NULL);
+        if (value != NULL) {
+            int exact = atoi(value);
+            if (exact >= 0) {
+                delete mixer->_opts[MULTIPLY];
+                mixer->_opts[MULTIPLY] = new MultsOpt(exact);
+            }
+        }
+    }
+
+    pugi::xml_node memory = get_single_child(a_node, "memory", loc_data, OPTIONAL);
+    if (memory) {
+        /* Fallback 0: do not split memory width */
+        value = get_attribute(memory, "split_memory_width", loc_data, OPTIONAL).as_string(NULL);
+        config->split_memory_width = (value != NULL) ? atoi(value) : 0;
+
+        /* "min" -> -1, "max" -> -2, anything else -> atoi; fallback 0: do not split memory depth */
+        value = get_attribute(memory, "split_memory_depth", loc_data, OPTIONAL).as_string(NULL);
+        if (value == NULL)
+            config->split_memory_depth = 0;
+        else if (strcmp(value, "min") == 0)
+            config->split_memory_depth = -1;
+        else if (strcmp(value, "max") == 0)
+            config->split_memory_depth = -2;
+        else
+            config->split_memory_depth = atoi(value);
+    }
+
+    pugi::xml_node adder = get_single_child(a_node, "adder", loc_data, OPTIONAL);
+    if (adder) {
+        /* "size", "padding" and "fracture" are looked up but their values are unused */
+        (void)get_attribute(adder, "size", loc_data, OPTIONAL);
+
+        /* Fallback 0: no minimum hard adder size */
+        value = get_attribute(adder, "threshold_size", loc_data, OPTIONAL).as_string(NULL);
+        config->min_threshold_adder = (value != NULL) ? atoi(value) : 0;
+
+        (void)get_attribute(adder, "padding", loc_data, OPTIONAL);
+
+        /* Fallback 1: fixed hard adder size */
+        value = get_attribute(adder, "fixed", loc_data, OPTIONAL).as_string(NULL);
+        config->fixed_hard_adder = (value != NULL) ? atoi(value) : 1;
+
+        (void)get_attribute(adder, "fracture", loc_data, OPTIONAL);
+    }
+}
\ No newline at end of file
diff --git a/parmys-plugin/utils/read_xml_config_file.h b/parmys-plugin/utils/read_xml_config_file.h
new file mode 100644
index 000000000..02f7a960f
--- /dev/null
+++ b/parmys-plugin/utils/read_xml_config_file.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _READ_XML_CONFIG_FILE_H_
+#define _READ_XML_CONFIG_FILE_H_
+
+#include "odin_types.h"
+
+extern void read_config_file(const char *file_name); /* reads the XML config file `file_name`; defined in read_xml_config_file.cc */
+#endif // _READ_XML_CONFIG_FILE_H_
diff --git a/parmys-plugin/utils/string_cache.cc b/parmys-plugin/utils/string_cache.cc
new file mode 100644
index 000000000..b34dc13b7
--- /dev/null
+++ b/parmys-plugin/utils/string_cache.cc
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include "string_cache.h"
+#include "vtr_memory.h"
+#include "vtr_util.h"
+#include <stdio.h>
+#include <string.h>
+
+unsigned long string_hash(STRING_CACHE *sc, const char *string);
+void generate_sc_hash(STRING_CACHE *sc);
+
+/* Polynomial hash of `string`: a = (a * sc->mul + byte) % sc->mod per byte, starting from 0.
+ * Computed in unsigned long long because a * mul needs more than 32 bits with the values set by
+ * sc_new_string_cache (mod 834535547, mul 247999) and overflowed where `long` is 32 bits wide. */
+unsigned long string_hash(STRING_CACHE *sc, const char *string)
+{
+    const unsigned long long mod = (unsigned long long)sc->mod, mul = (unsigned long long)sc->mul;
+    unsigned long long a = 0;
+    for (const char *p = string; *p; p++)
+        a = (a * mul + (unsigned char)*p) % mod;
+    return (unsigned long)a;
+}
+
+/* Rebuilds both hash tables of `sc` from scratch: frees the old ones, sizes new ones from sc->size,
+ * fills them with 0xff bytes (negative entries) and links every used slot at the head of its bucket. */
+void generate_sc_hash(STRING_CACHE *sc)
+{
+    if (sc->string_hash != NULL)
+        vtr::free(sc->string_hash);
+    if (sc->next_string != NULL)
+        vtr::free(sc->next_string);
+    sc->string_hash_size = sc->size * 2 + 11;
+    sc->string_hash = (long *)sc_do_alloc(sc->string_hash_size, sizeof(long));
+    sc->next_string = (long *)sc_do_alloc(sc->size, sizeof(long));
+    memset(sc->string_hash, 0xff, sc->string_hash_size * sizeof(long));
+    memset(sc->next_string, 0xff, sc->size * sizeof(long));
+    for (long slot = 0; slot < sc->free; slot++) {
+        const long bucket = string_hash(sc, sc->string[slot]) % sc->string_hash_size;
+        sc->next_string[slot] = sc->string_hash[bucket];
+        sc->string_hash[bucket] = slot;
+    }
+}
+
+/* Creates an empty cache with room for 100 strings and builds its (still empty) hash tables. */
+STRING_CACHE *sc_new_string_cache(void)
+{
+    STRING_CACHE *cache = (STRING_CACHE *)sc_do_alloc(1, sizeof(STRING_CACHE));
+    cache->mul = 247999; /* hash parameters read by string_hash */
+    cache->mod = 834535547;
+    cache->free = 0; /* no slot used yet */
+    cache->size = 100;
+    cache->string = (char **)sc_do_alloc(cache->size, sizeof(char *));
+    cache->data = (void **)sc_do_alloc(cache->size, sizeof(void *));
+    /* generate_sc_hash frees tables that are not NULL, so both start out as NULL */
+    cache->string_hash_size = 0;
+    cache->string_hash = NULL;
+    cache->next_string = NULL;
+    generate_sc_hash(cache);
+    return cache;
+}
+
+/* Looks `string` up in `sc` by walking the chain of its hash bucket.
+ * Returns the slot index of the matching entry, or -1 when there is none
+ * or when `sc` is NULL. A negative link marks the end of a chain. */
+long sc_lookup_string(STRING_CACHE *sc, const char *string)
+{
+    if (sc == NULL)
+        return -1;
+
+    const long bucket = string_hash(sc, string) % sc->string_hash_size;
+    for (long slot = sc->string_hash[bucket]; slot >= 0; slot = sc->next_string[slot]) {
+        if (strcmp(sc->string[slot], string) == 0)
+            return slot;
+    }
+
+    /* chain exhausted without a match */
+    return -1;
+}
+
+/* Returns the slot of `string` in `sc`, storing a vtr::strdup copy first when it is not cached yet.
+ * A new slot starts with data[slot] == NULL. When every slot is used, both arrays grow to
+ * size * 2 + 10 and the hash tables are rebuilt before the string is inserted. */
+long sc_add_string(STRING_CACHE *sc, const char *string)
+{
+    const long found = sc_lookup_string(sc, string);
+    if (found >= 0)
+        return found;
+
+    if (sc->free >= sc->size) {
+        sc->size = sc->size * 2 + 10;
+        /* move the string pointers into a larger array */
+        char **strings = (char **)sc_do_alloc(sc->size, sizeof(char *));
+        if (sc->free > 0)
+            memcpy(strings, sc->string, sc->free * sizeof(char *));
+        vtr::free(sc->string);
+        sc->string = strings;
+        /* same for the payload pointers */
+        void **payloads = (void **)sc_do_alloc(sc->size, sizeof(void *));
+        if (sc->free > 0)
+            memcpy(payloads, sc->data, sc->free * sizeof(void *));
+        vtr::free(sc->data);
+        sc->data = payloads;
+        generate_sc_hash(sc);
+    }
+
+    /* take the next unused slot and link it at the head of its bucket */
+    const long slot = sc->free++;
+    sc->string[slot] = vtr::strdup(string);
+    sc->data[slot] = NULL;
+    const long bucket = string_hash(sc, string) % sc->string_hash_size;
+    sc->next_string[slot] = sc->string_hash[bucket];
+    sc->string_hash[bucket] = slot;
+    return slot;
+}
+
+/* Allocates `a` chunks of `b` bytes through vtr::calloc; counts below 1 are
+ * raised to 1. A NULL result is reported on stderr and the allocation is
+ * retried until it succeeds, so this function never returns NULL. */
+void *sc_do_alloc(long a, long b)
+{
+    a = (a < 1) ? 1 : a;
+    b = (b < 1) ? 1 : b;
+
+    for (;;) {
+        void *r = vtr::calloc(a, b);
+        if (r != NULL)
+            return r;
+        fprintf(stderr, "Failed to allocated %ld chunks of %ld bytes (%ld bytes total)\n", a, b, a * b);
+    }
+}
+
+/* Releases `sc` and every string stored in it, then returns NULL so that
+ * callers can write `cache = sc_free_string_cache(cache);`.
+ * A NULL `sc` is accepted and simply yields NULL. */
+STRING_CACHE *sc_free_string_cache(STRING_CACHE *sc)
+{
+    if (sc == NULL)
+        return NULL;
+
+    /* the cached strings themselves are freed along with their pointer array */
+    if (sc->string != NULL) {
+        for (long slot = 0; slot < sc->free; slot++) {
+            if (sc->string[slot] != NULL)
+                vtr::free(sc->string[slot]);
+            sc->string[slot] = NULL;
+        }
+        vtr::free(sc->string);
+        sc->string = NULL;
+    }
+
+    /* only the pointer array is released here, not what its entries point to */
+    if (sc->data != NULL)
+        vtr::free(sc->data);
+    sc->data = NULL;
+
+    if (sc->string_hash != NULL)
+        vtr::free(sc->string_hash);
+    sc->string_hash = NULL;
+
+    if (sc->next_string != NULL)
+        vtr::free(sc->next_string);
+    sc->next_string = NULL;
+
+    vtr::free(sc);
+    return NULL;
+}
diff --git a/parmys-plugin/utils/string_cache.h b/parmys-plugin/utils/string_cache.h
new file mode 100644
index 000000000..bba9291b2
--- /dev/null
+++ b/parmys-plugin/utils/string_cache.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2022 CAS—Atlantic (University of New Brunswick, CASA)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef _STRING_CACHE_H_
+#define _STRING_CACHE_H_
+
+struct STRING_CACHE { /* growable string -> slot index table with chained hash buckets */
+    long size;             /* capacity of string[], data[] and next_string[] */
+    long string_hash_size; /* number of buckets in string_hash[] */
+    long free;             /* number of slots in use; also the next slot to hand out */
+    long mod;              /* modulus used by string_hash() */
+    long mul;              /* multiplier used by string_hash() */
+    char **string;         /* cached strings, indexed by slot */
+    void **data;           /* per-slot user pointer; NULL for a freshly added string */
+    long *string_hash;     /* bucket -> first slot of its chain, negative when empty */
+    long *next_string;     /* slot -> next slot in the same bucket, negative at the end */
+};
+
+/* creates an empty cache; strings are the keys and the void * in data[i] holds the payload of slot i */
+STRING_CACHE *sc_new_string_cache(void);
+/* returns the slot index of string, or -1 when it is not in the cache (or sc is NULL) */
+long sc_lookup_string(STRING_CACHE *sc, const char *string);
+/* returns the slot index of string, adding it first if needed; a freshly added slot has cache_name->data[i] == NULL */
+long sc_add_string(STRING_CACHE *sc, const char *string);
+void *sc_do_alloc(long, long); /* allocation helper of the cache: (chunk count, bytes per chunk) */
+
+/* frees the cache and the strings it holds; always returns NULL */
+STRING_CACHE *sc_free_string_cache(STRING_CACHE *sc);
+
+#endif // _STRING_CACHE_H_
diff --git a/third_party/mips32r1_core b/third_party/mips32r1_core
new file mode 160000
index 000000000..2dea2c58a
--- /dev/null
+++ b/third_party/mips32r1_core
@@ -0,0 +1 @@
+Subproject commit 2dea2c58abefd46ab6200449420d72454d496955
diff --git a/third_party/vtr_flow/LICENSE.md b/third_party/vtr_flow/LICENSE.md
new file mode 100644
index 000000000..01332da43
--- /dev/null
+++ b/third_party/vtr_flow/LICENSE.md
@@ -0,0 +1,69 @@
+# VTR License
+
+The software package "VTR" includes the software tools ODIN II, ABC, and VPR as
+well as additional benchmarks, documentation, libraries and scripts. The authors
+of the various components of VTR retain their ownership of their tools.
+
+* Unless otherwise noted (in particular ABC, the benchmark circuits and some libraries),
+all software, documents, and scripts in VTR, follows the standard MIT license described
+[here](http://www.opensource.org/licenses/mit-license.php) copied below for
+your convenience:
+
+> The MIT License (MIT)
+>
+> Copyright 2012 VTR Developers
+>
+> Permission is hereby granted, free of charge, to any person obtaining a copy of
+> this software and associated documentation files (the "Software"), to deal in
+> the Software without restriction, including without limitation the rights to
+> use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+> of the Software, and to permit persons to whom the Software is furnished to do
+> so, subject to the following conditions:
+>
+> The above copyright notice and this permission notice shall be included in all
+> copies or substantial portions of the Software.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+> SOFTWARE.
+
+* Terms and conditions for ABC is found
+[here](http://www.eecs.berkeley.edu/~alanmi/abc/copyright.htm) copied below
+for your convenience:
+
+> Copyright (c) The Regents of the University of California. All rights reserved.
+>
+> Permission is hereby granted, without written agreement and without license or
+> royalty fees, to use, copy, modify, and distribute this software and its
+> documentation for any purpose, provided that the above copyright notice and the
+> following two paragraphs appear in all copies of this software.
+>
+> IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+> DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+> THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
+> CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+>
+> THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+> BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+> A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
+> AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
+> SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+
+The benchmark circuits are all open source but each have their own
+individual terms and conditions which are listed in the source code of each
+benchmark.
+
+Subject to these conditions, the software is provided free of charge to all
+interested parties.
+
+If you do decide to use this tool, please reference our work as references are
+important in academia.
+
+Donations in the form of research grants to promote further research and
+development on the tools will be gladly accepted, either anonymously or with
+attribution on our future publications.
+
diff --git a/third_party/vtr_flow/README.md b/third_party/vtr_flow/README.md
new file mode 100755
index 000000000..53558ddb8
--- /dev/null
+++ b/third_party/vtr_flow/README.md
@@ -0,0 +1,7 @@
+# VTR Flow
+
+This folder contains architecture files, benchmark circuits and scripts for running the VTR flow.
+
+For a description see the ['Running the VTR Flow' documentation at https://docs.verilogtorouting.org](https://docs.verilogtorouting.org/en/latest/vtr/running_vtr/),
+or the [`run_vtr_flow.rst` file](../doc/src/vtr/run_vtr_flow.rst) in the [`doc` folder](../doc)
+in the [root of the VTR source tree](https://github.com/SymbiFlow/vtr-verilog-to-routing).
diff --git a/third_party/vtr_flow/arch/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml b/third_party/vtr_flow/arch/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
new file mode 100644
index 000000000..8170d72b0
--- /dev/null
+++ b/third_party/vtr_flow/arch/k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml
@@ -0,0 +1,3246 @@
+<!--
+    This is the architecture file for a modern Intel FPGA. The blocks (logic, RAM, DSP)
+    are Agilex-like, but the routing architecture is similar to Stratix IV. It is based
+    off the Stratix-10-like Architecture discussed in [1], the Agilex-like Architecture
+    mentioned in [6] and Stratix-IV-like Architecture mentioned in [5].
+
+    The delays and areas of various components in this arch come from COFFE [2]
+    runs using a 22nm technology node [3].
+
+    ##############################
+    Parameters
+    ##############################
+    Parameter | Value | Definition
+    __________|_______|______________________________
+    N         |    10 | Number of BLEs per cluster
+    W         |   300 | Channel width
+    L         |  4,16 | Wire segment length
+    I         |    60 | Number of cluster inputs
+    O         |    40 | Number of cluster outputs
+    K         |     6 | LUT size
+    Fs        |     3 | Switch block flexibility
+    Fcin      |  0.15 | Cluster input flexibility
+    Fcout     |   0.1 | Cluster output flexibility
+    Fclocal   |   0.5 | Local input crossbar population
+
+    ##############################
+    Logic Cluster
+    ##############################
+    This architecture has 10 ALMs (or FLEs: Fracturable Logic Elements) per Logic Cluster
+    (or LAB or CLB), where each ALM is a 6-LUT fracturable into
+    two 5-LUTs. The ALM has 8 inputs and 4 optionally registered outputs. The two 5-LUTs should
+    share at least two inputs. Each two ALM outputs are logically equivalent, which means any
+    output signal that can reach ALM.out[0] can reach ALM.out[1] and the same thing for
+    ALM.out[2] and ALM.out[3]. The ALMs in this architecture have an arithmetic mode
+    where each 5-LUT is fractured into two 4-LUTs, resulting in a total of four 4-LUTs and two
+    bits of addition per ALM. This architecture has a single carry chain that spans the 10 ALMs
+    in the LAB.
+
+    The LAB (or Logic Cluster or CLB) has 60 inputs and 40 outputs. Two outputs of each ALM are fed 
+    to the right and left LAB using direct links and are also fed back to the LAB as feedback connections 
+    sharing the 60 input ports with the signals coming from the routing channels.
+
+    The LAB has a 50% sparsely populated input crossbar.
+    
+    ##############################
+    DSP Slice
+    ##############################
+    This architecture has a DSP block that supports the following modes:
+
+    Fixed point modes:
+    _________________
+    1. 27x27 fixed point multiplier (multiply)
+    2. 27x27 fixed point mac (mac_int_27x27)
+    3. Two 18x19 fixed point multipliers (multiply)
+    4. Two 18x19 fixed point macs (mac_int_18x19)
+    5. Four 9x9 fixed point multipliers (multiply)
+    6. Four 9x9 fixed point macs (mac_int_9x9)
+    7. 27x27 plus 64 mode (mult_add_mode_27_27_64/mult_add_int_27x27). 27 * 27 + 64 -> 64. result = ax * ay + bx + chainin. chainout = result 
+    8. 18x19 sum-of-2 mode (sop_2_mode/int_sop_2) result = (bx * by) + (ax * ay) + chainin. chainout = result    
+    9. 18x19 plus 36 mode (mult_add_mode_18_19_36/mult_add_int_18x19). 18 * 19 + 36 -> 64. result = ax * ay + bx + chainin. chainout = result 
+    10. 9x9 sum-of-4 mode (sop_4_mode/int_sop_4) result = (dx * dy) + (cx * cy) + (bx * by) + (ax * ay) + chainin. chainout = result 
+    11. 9x9 sum-of-4 accum mode (sop_4_accum_mode/int_sop_accum_4) result = (dx * dy) + (cx * cy) + (bx * by) + (ax * ay) + chainin + accumulator. chainout = result 
+
+    Floating point modes:
+    ____________________
+
+    IMPORTANT:
+    The precisions supported are IEEE floating point 32-bit, IEEE floating point 16-bit and
+    Brain floating point (BF16). In the 16-bit mode descriptions, wherever "fp16" is used, it
+    refers to either IEEE floating point 16-bit or BF16. There are mode bits on the DSP slice
+    that can be used to differentiate between them. Doing this saves the effort of explicitly
+    specifying all the 16-bit modes twice in this file. 
+    Since the goal is architectural exploration and not functional simulation, the mode bits 
+    can be specified to any random value while instantiating the DSP slice in a Verilog benchmark.
+
+    1A. One fp32 multiplier (mult_fp_32)
+    1B. One fp32 multiplier, clocked (mult_fp_clk_32)
+    2A. One fp32 adder/subtractor (addition_fp_32)
+    2B. One fp32 adder/subtractor, clocked (addition_fp_clk_32)
+    3. One fp32 mac (mac_fp_32)
+    4A. Two fp16 multipliers (mult_fp_16)
+    4B. Two fp16 multipliers, clocked (mult_fp_clk_16)
+    5A. Two fp16 adders/subtractors (addition_fp_16)
+    5B. Two fp16 adders/subtractors, clocked (addition_fp_clk_16)
+    6. Two fp16 macs (mac_fp_16)
+    7. floating point fp16 sum-of-products mode (result = fp16_mult_top_a * fp16_mult_top_b + fp16_mult_bot_a * fp16_mult_bot_b. chainout = third_inp or result) (fp16_sum_of_products_mode/fp16_mult_add)
+    8. floating point fp16 sum-of-2 mult mode (result = fp16_mult_top_a * fp16_mult_top_b + fp16_mult_bot_a * fp16_mult_bot_b + fp32 chainin or third inp. chainout = third_inp or result) (fp16_sum_of_products_2_mult_mode/fp16_sop2_mult)
+    9. floating point fp16 sum-of-2 accum mode (result = fp16_mult_top_a * fp16_mult_top_b + fp16_mult_bot_a * fp16_mult_bot_b + accumulator. chainout = result) (fp16_sum_of_products_2_accum_mode/fp16_sop2_accum)
+    10. floating point fp16 mult, fp32 add mode (chainout = fp16_mult_top_a * fp16_mult_top_b + fp16_mult_bot_a * fp16_mult_bot_b. result = chainin + third_inp) (fp16_mult_fp32_add/fp16_mult_fp32_add)
+    11. floating point fp16 mult, fp32 accum mode (chainout = fp16_mult_top_a * fp16_mult_top_b + fp16_mult_bot_a * fp16_mult_bot_b. result = third_inp + accumulator) (fp16_mult_fp32_accum/fp16_mult_fp32_accum)
+    12. floating point fp32 mult_then_add mode (result = fp32_mult_a * fp32_mult_b + chainin. chainout = third_inp or result) (fp32_mult_then_add/fp32_mult_then_add)
+    13. floating point fp32 mult_add mode (chainout = fp32_mult_a * fp32_mult_b. result = third_inp + chainin) (fp32_mult_add/fp32_mult_add)
+
+    The DSP block was designed in Verilog and COFFE's [2] hybrid flow was used to generate
+    area and delay results. The standard cell library used was Cadence GPDK 45nm (gsclib045_svt_v4.4)
+    and area/delay scaling equations from [4] were used.
+
+    A 50% sparsely populated input crossbar was added to the DSP block but is commented out.
+    It was leading to a failure in VPR. See the discussion on this commit: 
+    https://github.com/verilog-to-routing/vtr-verilog-to-routing/commit/ea7acf1582ece35e892c26b756aa302d2e12ddb2
+
+    Once this is fixed, the input crossbar code can be enabled.
+
+    ##############################
+    Memory Blocks
+    ##############################
+    The architecture also has 20Kb memory blocks (or M20k or BRAM) that have true and simple dual port modes. 
+    In simple dual port mode the memory can be configured in the following modes: 512x40, 1024x20 and 2048x10,
+    while in true dual port mode it can be configured as: 1024x20 and 2048x10.
+
+    The BRAM has registered inputs and outputs. See details on how the delays for this block were 
+    obtained, in the comments before the specification of the BRAM primitive, towards the end of this file.
+
+    The BRAM doesn't have an input crossbar. Adding an input crossbar was leading to a 
+    seg fault in VPR, likely because of https://github.com/verilog-to-routing/vtr-verilog-to-routing/issues/1475
+
+    Once this is fixed, an input crossbar can be added. The input crossbar delay from COFFE was: 29.47ps
+
+    ##############################
+    Routing/Interconnect
+    ##############################
+    The routing channel width is 300. Note that the channel width isn't specified directly in this arch file. 
+    Switch pattern calculations assume that value. During experiments, channel width can be specified using 
+    the command line switch `route_chan_width`.
+    The architecture uses unidirectional routing with wire segments of length 4 (260 out of 300 wires) and 
+    length 16 (40 out of 300 wires). The length 16 wires do not directly connect to block pins and are only 
+    accessible from the length 4 wires. Switches appear after every 4 blocks on the length 16 wires. 
+    The switch blocks use a custom switching pattern based on the Stratix-IV-like architecture used in the 
+    Titan flow [5]. 
+
+    ##############################
+    I/Os
+    ##############################
+    I/O pads are arranged along the perimeter of the FPGA. No area values provided for the I/Os.
+
+    ##############################
+    Comments on similarities and differences with Intel FPGA architecture.
+    ##############################
+    The main parameters of the logic blocks, DSPs and RAMs are similar to Intel FPGAs. But here are
+    some important points:
+    1. The DSP slice supports lower precision modes - int8 (actually 9x9) and 16-bit floating point
+       (IEEE half-precision and bfloat16). These modes are present in Intel Agilex FPGA DSPs.
+    2. DSPs are chained in vertical direction (chainin-chainout connections for output cascading
+       and scanin-scanout connections for input cascading). This is a common feature
+       in modern FPGAs.
+    3. There are no registers on the interconnect/routing wires in this architecture. That is a main
+       feature in the Stratix10 and Agilex families of Intel FPGA (it's called HyperFlex by Intel).
+    4. The architecture doesn't have sectors. All blocks are laid out in columns on the entire chip.
+       Most modern Intel FPGAs have sector based layout.
+    5. The IOs are on the perimeter, instead of being arranged in columns. Modern FPGAs arrange I/Os in
+       columns.
+    6. The routing architecture is similar to Stratix IV. There are wire segments of L=4 
+       and L=16, and a custom switch pattern (not a standard Wilton switch block) is used. 
+
+    [1] M. Eldafrawy, A. Boutros, S. Yazdanshenas, and V. Betz, "FPGA Logic Block Architectures for
+        Efficient Deep Learning Inference" in ACM TRETS, 2020
+    [2] S. Yazdanshenas, and V. Betz, "COFFE 2: Automatic Modelling and Optimization of
+        Complex and Heterogeneous FPGA Architectures" in ACM TRETS, 2019. 
+    [3] PTM High Performance 22nm Metal Gate / High-K / Strained-Si 22NM_BULK_HP, from http://ptm.asu.edu/
+        See: https://github.com/vaughnbetz/COFFE/blob/master/spice_models/ptm_22nm_bulk_hp.l
+    [4] A. Stillmaker and B. Baas, "Scaling equations for the accurate prediction of CMOS device 
+        performance from 180 nm to 7 nm" in Integration, the VLSI Journal (2017)
+    [5] K. E. Murray et al., “Timing-Driven Titan: Enabling Large Benchmarks and Exploring the Gap between 
+        Academic and Commercial CAD,” TRETS 2015.
+    [6] A. Arora et al., "Tensor Slices to the Rescue: Supercharging ML Acceleration on FPGAs", ISFPGA 2020.
+-->
+
+<architecture>
+  <!-- 
+         ODIN II specific config begins 
+         Describes the types of user-specified netlist blocks (in blif, this corresponds to 
+         ".model [type_of_block]") that this architecture supports.
+
+         Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
+         already special structures in blif (.names, .input, .output, and .latch) 
+         that describe them.
+    -->
+  <models>
+    <model name="single_port_ram"> <!-- single-port RAM: the we/addr/data inputs and the out output all use clock="clk" -->
+      <input_ports>
+        <port name="we" clock="clk" combinational_sink_ports="out"/>
+        <!-- we: control -->
+        <port name="addr" clock="clk" combinational_sink_ports="out"/>
+        <!-- addr: address lines -->
+        <port name="data" clock="clk" combinational_sink_ports="out"/>
+        <!-- data: data lines; can be broken down into smaller bit widths, minimum size 1 -->
+        <port name="clk" is_clock="1"/>
+        <!-- clk: memories are often clocked -->
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+        <!-- out: output; can be broken down into smaller bit widths, minimum size 1 -->
+      </output_ports>
+    </model>
+    <model name="dual_port_ram"> <!-- dual-port RAM: the "1" inputs list out1 and the "2" inputs list out2 as their sink; one shared clk -->
+      <input_ports>
+        <port name="we1" clock="clk" combinational_sink_ports="out1"/>
+        <!-- we1: write enable -->
+        <port name="we2" clock="clk" combinational_sink_ports="out2"/>
+        <!-- we2: write enable -->
+        <port name="addr1" clock="clk" combinational_sink_ports="out1"/>
+        <!-- addr1: address lines -->
+        <port name="addr2" clock="clk" combinational_sink_ports="out2"/>
+        <!-- addr2: address lines -->
+        <port name="data1" clock="clk" combinational_sink_ports="out1"/>
+        <!-- data1: data lines; can be broken down into smaller bit widths, minimum size 1 -->
+        <port name="data2" clock="clk" combinational_sink_ports="out2"/>
+        <!-- data2: data lines; can be broken down into smaller bit widths, minimum size 1 -->
+        <port name="clk" is_clock="1"/>
+        <!-- clk: memories are often clocked -->
+      </input_ports>
+      <output_ports>
+        <port name="out1" clock="clk"/>
+        <!-- out1: output; can be broken down into smaller bit widths, minimum size 1 -->
+        <port name="out2" clock="clk"/>
+        <!-- out2: output; can be broken down into smaller bit widths, minimum size 1 -->
+      </output_ports>
+    </model>
+    <!-- Fixed point multiplier, used inside DSPs.
+         Inputs a and b both list out as their combinational sink; no port has a clock.
+         ODIN infers these when a * sign appears in RTL. -->
+    <model name="multiply">
+      <input_ports>
+        <port name="a" combinational_sink_ports="out"/>
+        <port name="b" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out"/>
+      </output_ports>
+    </model>
+    <!-- fp16 floating point multiplier, used inside DSPs (floating point mode 4A in the header
+         comment of this file). Unclocked: a and b both list out as their combinational sink. -->
+    <model name="mult_fp_16">
+      <input_ports>
+        <port name="a" combinational_sink_ports="out"/>
+        <port name="b" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out"/>
+      </output_ports>
+    </model>   
+    <model name="mult_fp_32"> <!-- fp32 floating point multiplier (floating point mode 1A in the header comment); no clock port -->
+      <input_ports>
+        <port name="a" combinational_sink_ports="out"/>
+        <port name="b" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out"/>
+      </output_ports>
+    </model>   
+    <model name="mult_fp_clk_16"> <!-- clocked fp16 multiplier (floating point mode 4B in the header comment): a, b and out all use clock="clk" -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <model name="mult_fp_clk_32"> <!-- clocked fp32 multiplier (floating point mode 1B in the header comment): a, b and out all use clock="clk" -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <!-- 1-bit adder with carry in (cin) and carry out (cout); only used inside CLBs.
+         ODIN infers these when a + sign appears in RTL.
+         It can't be used inside the DSP slice,
+         because ODIN gets confused and starts to connect multi-bit
+         adders and single-bit adders in different PBs. -->
+    <model name="adder">
+      <input_ports>
+        <port name="a" combinational_sink_ports="cout sumout"/>
+        <port name="b" combinational_sink_ports="cout sumout"/>
+        <port name="cin" combinational_sink_ports="cout sumout"/>
+      </input_ports>
+      <output_ports>
+        <port name="cout"/>
+        <port name="sumout"/>
+      </output_ports>
+    </model>
+    <!-- Multi-bit fp16 floating point adder/subtractor inside DSP slices (floating point mode 5A in the header comment); no clock port -->
+    <model name="addition_fp_16">
+      <input_ports>
+        <port name="a" combinational_sink_ports="out"/>
+        <port name="b" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out"/>
+      </output_ports>
+    </model>
+    <model name="addition_fp_32"> <!-- multi-bit fp32 adder/subtractor (floating point mode 2A in the header comment); no clock port -->
+      <input_ports>
+        <port name="a" combinational_sink_ports="out"/>
+        <port name="b" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out"/>
+      </output_ports>
+    </model>
+    <model name="addition_fp_clk_16"> <!-- clocked fp16 adder/subtractor (floating point mode 5B in the header comment): a, b and out all use clock="clk" -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <model name="addition_fp_clk_32"> <!-- clocked fp32 adder/subtractor (floating point mode 2B in the header comment): a, b and out all use clock="clk" -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: 18x19 sum-of-2 (fixed point mode 8 in the header comment): result = (bx * by) + (ax * ay) + chainin. result uses clock="clk"; chainout has no clock attribute. -->
+    <model name="int_sop_2">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="ax" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="ay" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="by" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: 27x27 plus 64 (fixed point mode 7 in the header comment): result = ax * ay + bx + chainin. Also has scanin/scanout ports; chainout and scanout have no clock attribute. -->
+    <model name="mult_add_int_27x27">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="ax" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="ay" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="bx" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="scanin" clock="clk" combinational_sink_ports="result scanout chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+        <port name="scanout"/>
+      </output_ports>
+    </model>
+    <model name="mult_add_int_18x19"> <!-- DSP slice mode: 18x19 plus 36 (fixed point mode 9 in the header comment); same port list as mult_add_int_27x27 -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="ax" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="ay" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="bx" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result scanout chainout"/>
+        <port name="scanin" clock="clk" combinational_sink_ports="result scanout chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+        <port name="scanout"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: 9x9 sum-of-4 (fixed point mode 10 in the header comment): result = (dx * dy) + (cx * cy) + (bx * by) + (ax * ay) + chainin. chainout has no clock attribute. -->
+    <model name="int_sop_4">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="ax" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="ay" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="by" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="cx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="cy" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="dx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="dy" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: 9x9 sum-of-4 accum (fixed point mode 11 in the header comment); same port list as int_sop_4, the accumulator is not a port. -->
+    <model name="int_sop_accum_4">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="ax" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="ay" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="by" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="cx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="cy" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="dx" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="dy" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model>
+    <!-- fp16 floating point MAC inside DSP slices (floating point mode 6 in the header comment): reset, a, b and out all use clock="clk" -->
+    <model name="mac_fp_16">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="out"/>
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <model name="mac_fp_32"> <!-- fp32 floating point MAC (floating point mode 3 in the header comment); same port list as mac_fp_16 -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="out"/>
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <!-- 27x27 fixed point MAC inside DSP slices (fixed point mode 2 in the header comment): reset, a, b and out all use clock="clk" -->
+    <model name="mac_int_27x27">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="out"/>
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <model name="mac_int_18x19"> <!-- 18x19 fixed point MAC (fixed point mode 4 in the header comment); same port list as mac_int_27x27 -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="out"/>
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <model name="mac_int_9x9"> <!-- 9x9 fixed point MAC (fixed point mode 6 in the header comment); same port list as mac_int_27x27 -->
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="out"/>
+        <port name="a" clock="clk" combinational_sink_ports="out"/>
+        <port name="b" clock="clk" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: fp16 sum-of-products (floating point mode 7 in the header comment). Has fp32_in but no chainin port; chainout has no clock attribute. -->
+    <model name="fp16_mult_add">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="fp32_in" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model> 
+    <!-- DSP slice mode: fp16 sum-of-2 mult (floating point mode 8 in the header comment). Has both fp32_in and chainin ports; chainout has no clock attribute. -->
+    <model name="fp16_sop2_mult">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="fp32_in" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model> 
+    <!-- DSP slice mode: fp16 sum-of-2 accum (floating point mode 9 in the header comment). Has neither fp32_in nor chainin ports; chainout has no clock attribute. -->
+    <model name="fp16_sop2_accum">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_b" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: fp16 mult, fp32 add (floating point mode 10 in the header comment). Has both fp32_in and chainin ports; chainout has no clock attribute. -->
+    <model name="fp16_mult_fp32_add">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="fp32_in" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model>    
+    <!-- DSP slice mode: fp16 mult, fp32 accum (floating point mode 11 in the header comment). Has fp32_in but no chainin port; chainout has no clock attribute. -->
+    <model name="fp16_mult_fp32_accum">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="top_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="bot_b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="fp32_in" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model>
+    <!-- DSP slice mode: fp32 mult_add (floating point mode 13 in the header comment). Inputs are a, b, fp32_in and chainin; chainout has no clock attribute. -->
+    <model name="fp32_mult_add">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="fp32_in" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model> 
+    <!-- DSP slice mode: fp32 mult_then_add (floating point mode 12 in the header comment). Same port list as fp32_mult_add; chainout has no clock attribute. -->
+    <model name="fp32_mult_then_add">
+      <input_ports>
+        <port name="clk" is_clock="1"/>  
+        <port name="reset" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="mode_sigs" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="a" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="b" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="fp32_in" clock="clk" combinational_sink_ports="result chainout"/>
+        <port name="chainin" clock="clk" combinational_sink_ports="result chainout"/>
+      </input_ports>
+      <output_ports>
+        <port name="result" clock="clk"/>
+        <port name="chainout"/>
+      </output_ports>
+    </model> 
+  </models>
+  <tiles>
+    <tile name="io" area="0"> <!-- I/O tile; the header comment notes that no area values are provided for the I/Os -->
+      <sub_tile name="io" capacity="8">
+        <equivalent_sites>
+          <site pb_type="io" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="outpad" num_pins="1"/>
+        <output name="inpad" num_pins="1"/>
+        <clock name="clock" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/> <!-- same Fcin/Fcout values as the parameter table in the header comment -->
+        <pinlocations pattern="custom"> <!-- all three ports are listed on every side -->
+          <loc side="left">io.outpad io.inpad io.clock</loc>
+          <loc side="top">io.outpad io.inpad io.clock</loc>
+          <loc side="right">io.outpad io.inpad io.clock</loc>
+          <loc side="bottom">io.outpad io.inpad io.clock</loc>
+        </pinlocations>
+      </sub_tile>
+    </tile>
+    <tile name="clb" height="1" width="1" area="27905"> <!-- logic cluster tile (the LAB/CLB described in the header comment) -->
+      <sub_tile name="clb">
+        <equivalent_sites>
+          <site pb_type="clb" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="I1" num_pins="15" equivalent="full"/> <!-- I1..I4: 4 x 15 = 60 cluster inputs -->
+        <input name="I2" num_pins="15" equivalent="full"/>
+        <input name="I3" num_pins="15" equivalent="full"/>
+        <input name="I4" num_pins="15" equivalent="full"/>
+        <input name="cin" num_pins="1"/>
+        <output name="O" num_pins="40" equivalent="none"/> <!-- 40 cluster outputs -->
+        <output name="cout" num_pins="1"/>
+        <clock name="clk" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+          <fc_override port_name="cin" fc_type="frac" fc_val="0"/>
+          <fc_override port_name="cout" fc_type="frac" fc_val="0"/>
+          <!-- clock pins do not connect to local routing; cin and cout above are likewise overridden to an Fc of 0 -->
+          <fc_override fc_type="frac" fc_val="0" port_name="clk"/>
+        </fc>
+        <pinlocations pattern="spread"/>
+      </sub_tile>
+    </tile>
+    <tile name="dsp_top" height="4" width="1" area="253779">
+      <sub_tile name="dsp_top">
+        <equivalent_sites>
+          <site pb_type="dsp_top" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="reset" num_pins="1" is_non_clock_global="true"/>
+        <input name="dsp_I1" num_pins="64" />
+        <input name="dsp_I2" num_pins="64" />
+        <input name="chainin" num_pins="64"/>
+        <input name="scanin" num_pins="27"/>
+        <output name="result" num_pins="74"/>
+        <output name="chainout" num_pins="64"/>
+        <output name="scanout" num_pins="27"/>
+        <clock name="clk" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+          <!-- clock pins and chain ports do not connect to local routing -->
+          <fc_override port_name="clk" fc_type="frac" fc_val="0"/>
+          <fc_override port_name="chainin" fc_type="frac" fc_val="0"/>
+          <fc_override port_name="chainout" fc_type="frac" fc_val="0"/>
+          <fc_override port_name="scanin" fc_type="frac" fc_val="0"/>
+          <fc_override port_name="scanout" fc_type="frac" fc_val="0"/>
+        </fc>
+        <pinlocations pattern="custom">
+        	  <loc side="left" yoffset="0">dsp_top.dsp_I1[31:0]</loc>
+	          <loc side="right" yoffset="1">dsp_top.dsp_I1[63:32]</loc>
+	          <loc side="left" yoffset="2">dsp_top.dsp_I2[31:0]</loc>
+	          <loc side="right" yoffset="3">dsp_top.dsp_I2[63:32]</loc>
+	          <loc side="top">dsp_top.chainin dsp_top.scanin</loc>
+	          <loc side="bottom">dsp_top.chainout dsp_top.scanout</loc>
+	          <loc side="right" yoffset="0">dsp_top.result[17:0] dsp_top.clk</loc>
+	          <loc side="left" yoffset="1">dsp_top.result[36:18]</loc>
+	          <loc side="right" yoffset="2">dsp_top.result[55:37] </loc>
+	          <loc side="left" yoffset="3">dsp_top.result[73:56] dsp_top.reset</loc>
+        </pinlocations>
+      </sub_tile>
+    </tile>
+    <tile name="memory" height="2" width="1" area="137668">
+      <sub_tile name="memory">
+        <equivalent_sites>
+          <site pb_type="memory" pin_mapping="direct"/>
+        </equivalent_sites>
+      <input name="addr1" num_pins="11"/>
+      <input name="addr2" num_pins="11"/>
+      <input name="data" num_pins="40"/>
+      <input name="we1" num_pins="1"/>
+      <input name="we2" num_pins="1"/>
+      <output name="out" num_pins="40"/>
+      <clock name="clk" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+          <fc_override fc_type="frac" fc_val="0" port_name="clk"/>
+        </fc>  
+        <pinlocations pattern="spread"/>
+      </sub_tile>
+    </tile>
+  </tiles>
+  <!-- ODIN II specific config ends -->
+  <layout>
+    <!-- Physical descriptions begin -->
+    <auto_layout aspect_ratio="1.0">
+      <perimeter type="io" priority="101"/>
+      <corners type="EMPTY" priority="102"/>
+      <fill type="clb" priority="10"/>
+      <col type="dsp_top" startx="6" starty="1" repeatx="16" priority="20"/>
+      <col type="memory" startx="2" starty="1" repeatx="16" priority="20"/>
+    </auto_layout>
+    <!--
+    <fixed_layout name="mylayout" width="178" height="82">
+      <perimeter type="io" priority="101"/>
+      <corners type="EMPTY" priority="102"/>
+      
+      <col type="dsp_top"  startx="1"  starty="1"  priority="100"/>
+      <col type="clb"  startx="2"  starty="1"  priority="100"/>
+      <col type="clb"  startx="3"  starty="1"  priority="100"/>
+      <col type="dsp_top"  startx="4"  starty="1"  priority="100"/>
+      <col type="clb"  startx="5"  starty="1"  priority="100"/>
+      <col type="clb"  startx="6"  starty="1"  priority="100"/>
+      <col type="dsp_top"  startx="7"  starty="1"  priority="100"/>
+      <col type="clb"  startx="8"  starty="1"  priority="100"/>
+      <col type="clb"  startx="9"  starty="1"  priority="100"/>
+      <col type="dsp_top"  startx="10"  starty="1"  priority="100"/>
+      <col type="clb"  startx="11"  starty="1"  priority="100"/>
+      <col type="clb"  startx="12"  starty="1"  priority="100"/>
+      <col type="dsp_top"  startx="13"  starty="1"  priority="100"/>
+
+      <region type="clb" startx="14"   endx="88"   starty="1" incrx="5"  priority="20"/>
+      <region type="clb" startx="15"   endx="88"   starty="1" incrx="5"  priority="20"/>
+      <region type="clb" startx="16"   endx="88"   starty="1" incrx="5"  priority="20"/>
+      <region type="dsp_top" startx="17"   endx="88"   starty="1" incrx="5"  priority="20"/>
+      <region type="memory" startx="18"   endx="88"   starty="1" incrx="5"  priority="20"/>
+
+      <region type="memory" startx="89"   endx="163"   starty="1" incrx="5"  priority="20"/>
+      <region type="dsp_top" startx="90"   endx="163"   starty="1" incrx="5"  priority="20"/>
+      <region type="clb" startx="91"   endx="163"   starty="1" incrx="5"  priority="20"/>
+      <region type="clb" startx="92"   endx="163"   starty="1" incrx="5"  priority="20"/>
+      <region type="clb" startx="93"   endx="163"   starty="1" incrx="5"  priority="20"/>
+
+      <col type="dsp_top"  startx="164"  starty="1"  priority="20"/>
+      <col type="clb"  startx="165"  starty="1"  priority="1"/>
+      <col type="clb"  startx="166"  starty="1"  priority="1"/>
+      <col type="dsp_top"  startx="167"  starty="1"  priority="20"/>
+      <col type="clb"  startx="168"  starty="1"  priority="1"/>
+      <col type="clb"  startx="169"  starty="1"  priority="1"/>
+      <col type="dsp_top"  startx="170"  starty="1"  priority="20"/>
+      <col type="clb"  startx="171"  starty="1"  priority="1"/>
+      <col type="clb"  startx="172"  starty="1"  priority="1"/>
+      <col type="dsp_top"  startx="173"  starty="1"  priority="20"/>
+      <col type="clb"  startx="174"  starty="1"  priority="1"/>
+      <col type="clb"  startx="175"  starty="1"  priority="1"/>
+      <col type="dsp_top"  startx="176"  starty="1"  priority="20"/>
+    </fixed_layout> 
+    -->
+  </layout>
+  <device>
+    <sizing R_minW_nmos="13090" R_minW_pmos="19086.83"/>
+    <area grid_logic_tile_area="0"/>
+    <chan_width_distr>
+      <x distr="uniform" peak="1.000000"/>
+      <y distr="uniform" peak="1.000000"/>
+    </chan_width_distr>
+    <switch_block type="custom"/>
+    <connection_block input_switch_name="ipin_cblock"/>
+  </device>
+  <switchlist>
+    <switch type="mux" name="L4_driver" R="0.0" Cin="0.0" Cout="0.0" Tdel="207.9e-12" mux_trans_size="2.377" buf_size="35.69"/>
+    <!-- Delay of L16 driver is scaled from L4 by a factor of 1.5x (based on numbers from the Titan Stratix IV architecture file)
+	 Area numbers will not be totally accurate because of the same buf_size -->
+    <switch type="mux" name="L16_driver" R="0.0" Cin="0.0" Cout="0.0" Tdel="312.9e-12" mux_trans_size="2.377" buf_size="35.69"/> 
+    <switch type="mux" name="ipin_cblock" R="0.0" Cout="0.0" Cin="0.0" Tdel="130e-12" mux_trans_size="1.508" buf_size="11.71"/>
+  </switchlist>
+  <segmentlist>
+    <segment name="L4" freq="260" length="4" type="unidir" Rmetal="0.0" Cmetal="0.0">
+      <mux name="L4_driver"/>
+      <sb type="pattern">1 1 1 1 1</sb>
+      <cb type="pattern">1 1 1 1</cb>
+    </segment>
+    <segment name="L16" freq="40" length="16" type="unidir" Rmetal="0.0" Cmetal="0.0">
+      <mux name="L16_driver"/>
+      <!-- Vias from the top of the metal stack (global layers, where the long wires are 
+           implemented) down to the middle/bottom of the metal stack (semi-global layers, 
+           where the short wires are implemented) are expensive and restrictive.
+           As a result Stratix IV only places long wire switch blocks every 4 LABs -->
+      <sb type="pattern">1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1</sb>
+      <!-- For the same reasons, long wires do not connect to block pins in Stratix IV -->
+      <cb type="pattern">0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0</cb>
+    </segment>
+  </segmentlist>
+  <directlist>
+    <!-- Direct connect from one LAB to the LAB directly below it (carry chain) -->
+    <direct name="adder_carry" from_pin="clb.cout" to_pin="clb.cin" x_offset="0" y_offset="-1" z_offset="0"/>
+    <!-- Direct connect from one DSP to the DSP directly below it -->
+    <direct name="dsp_out_chain" from_pin="dsp_top.chainout" from_side="bottom" to_pin="dsp_top.chainin" to_side="top" x_offset="0" y_offset="-4" z_offset="0"/>
+    <direct name="dsp_in_chain" from_pin="dsp_top.scanout" from_side="bottom" to_pin="dsp_top.scanin" to_side="top" x_offset="0" y_offset="-4" z_offset="0"/>
+  </directlist>
+  <complexblocklist>
+    <!-- Define I/O pads begin -->
+    <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. -->
+    <pb_type name="io">
+      <input name="outpad" num_pins="1"/>
+      <output name="inpad" num_pins="1"/>
+      <clock name="clock" num_pins="1"/>
+      <!-- IOs can operate as either inputs or outputs.
+	     Delays below come from Ian Kuon. They are small, so they should be interpreted as
+	     the delays to and from registers in the I/O (generally I/Os are registered
+	     today, and that is the point at which you timing-analyze them).
+	     -->
+      <mode name="inpad">
+        <pb_type name="inpad" blif_model=".input" num_pb="1">
+          <output name="inpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="inpad" input="inpad.inpad" output="io.inpad">
+            <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="outpad">
+        <pb_type name="outpad" blif_model=".output" num_pb="1">
+          <input name="outpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="outpad" input="io.outpad" output="outpad.outpad">
+            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel -->
+      <!-- IOs go on the periphery of the FPGA, for consistency, 
+          make it physically equivalent on all sides so that only one definition of I/Os is needed.
+          If I do not make a physically equivalent definition, then I need to define 4 different I/Os, one for each side of the FPGA
+        -->
+      <!-- Place I/Os on the sides of the FPGA -->
+    </pb_type>
+    <!-- Define I/O pads ends -->
+    <!-- Define general purpose logic block (CLB) begin -->
+    <pb_type name="clb">
+      <input name="I1" num_pins="15" equivalent="full"/>
+      <input name="I2" num_pins="15" equivalent="full"/>
+      <input name="I3" num_pins="15" equivalent="full"/>
+      <input name="I4" num_pins="15" equivalent="full"/>
+      <input name="cin" num_pins="1"/>
+      <output name="O" num_pins="40" equivalent="none"/>
+      <output name="cout" num_pins="1"/>
+      <clock name="clk" num_pins="1"/>
+      <pb_type name="lab" num_pb="1">
+        <input name="I1" num_pins="15"/>
+        <input name="I2" num_pins="15"/>
+        <input name="I3" num_pins="15"/>
+        <input name="I4" num_pins="15"/>
+        <input name="cin" num_pins="1"/>
+        <output name="O" num_pins="40"/>
+        <output name="cout" num_pins="1"/>
+        <clock name="clk" num_pins="1"/>
+        <!-- Describe fracturable logic element.  
+                 Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs. 
+                 The outputs of the fracturable logic element can be optionally registered
+            -->
+        <pb_type name="fle" num_pb="10">
+          <input name="in" num_pins="8"/>
+          <input name="cin" num_pins="1"/>
+          <output name="out" num_pins="4"/>
+          <output name="cout" num_pins="1"/>
+          <clock name="clk" num_pins="1"/>
+          <!-- 
+                    The ALM inputs are as follows:
+                            A -> fle[0]
+                            B -> fle[1]
+                            C -> fle[2]
+                            D -> fle[3]
+                            E -> fle[4]
+                            F -> fle[5]
+                            G -> fle[6]
+                            H -> fle[7]
+              -->
+          <mode name="n2_lut5">
+            <pb_type name="ble5" num_pb="2">
+              <input name="in" num_pins="5"/>
+              <input name="cin" num_pins="1"/>
+              <output name="out" num_pins="2"/>
+              <output name="cout" num_pins="1"/>
+              <clock name="clk" num_pins="1"/>
+              <mode name="blut5">
+                <pb_type name="flut5" num_pb="1">
+                  <input name="in" num_pins="5"/>
+                  <output name="out" num_pins="2"/>
+                  <clock name="clk" num_pins="1"/>
+                  <!-- Regular LUT mode -->
+                  <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut">
+                    <input name="in" num_pins="5" port_class="lut_in"/>
+                    <output name="out" num_pins="1" port_class="lut_out"/>
+                    <!-- LUT timing using delay matrix -->
+                    <!-- These are the physical delay inputs on a Stratix 10 LUT but because VPR cannot do LUT rebalancing,
+                             we instead take the average of these numbers to get more stable results
+                             note that those are the same delays for inputs A - E as the ones used for the 6-LUT, however, we have 
+                             subtracted the delay of the last mux stage to get the delay of inputs A - E till the 5-LUT output
+                             210.96e-12
+                             206.85e-12
+                             143.46e-12
+                             136.94e-12
+                             68.12e-12
+                          -->
+                    <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out">
+                            153.27e-12
+                            153.27e-12
+                            153.27e-12
+                            153.27e-12
+                            153.27e-12
+                        </delay_matrix>
+                  </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                  <interconnect>
+                    <direct name="lut5_in" input="flut5.in" output="lut5.in"/>
+                    <direct name="reg_in" input="flut5.in[0]" output="ff[0].D"/>
+                    <direct name="lut5_ff" input="lut5.out" output="ff[1].D">
+                      <delay_constant max="18.96e-12" in_port="lut5.out" out_port="ff[1].D"/>
+                      <pack_pattern name="ble5" in_port="lut5.out" out_port="ff[1].D"/>
+                    </direct>
+                    <complete name="clock" input="flut5.clk" output="ff.clk"/>
+                    <complete name="out_mux" input="ff.Q lut5.out" output="flut5.out">
+                      <delay_constant max="39.85e-12" in_port="lut5.out" out_port="flut5.out"/>
+                      <delay_constant max="39.85e-12" in_port="ff.Q" out_port="flut5.out"/>
+                    </complete>
+                  </interconnect>
+                </pb_type>
+                <interconnect>
+                  <direct name="direct1" input="ble5.in" output="flut5.in"/>
+                  <direct name="direct2" input="ble5.clk" output="flut5.clk"/>
+                  <direct name="direct3" input="flut5.out" output="ble5.out"/>
+                </interconnect>
+              </mode>
+              <mode name="arithmetic">
+                <pb_type name="arithmetic" num_pb="1">
+                  <input name="in" num_pins="4"/>
+                  <input name="cin" num_pins="1"/>
+                  <output name="out" num_pins="2"/>
+                  <output name="cout" num_pins="1"/>
+                  <clock name="clk" num_pins="1"/>
+                  <!-- Special dual-LUT mode that drives adder only -->
+                  <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut">
+                    <input name="in" num_pins="4" port_class="lut_in"/>
+                    <output name="out" num_pins="1" port_class="lut_out"/>
+                    <!-- LUT timing using delay matrix -->
+                    <!-- These are the physical delay inputs on a Stratix 10 LUT but because VPR cannot do LUT rebalancing,
+                           we instead take the average of these numbers to get more stable results
+                           note that those are the same delays for inputs A - D as the ones used for the 6-LUT, however, we have
+                           subtracted the delay of the final mux stages to get the delay of inputs A - D till the 4-LUT output
+                             168.12e-12
+                             164.02e-12
+                             100.63e-12
+                             94.11e-12
+                          -->
+                    <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
+                            131.72e-12
+                            131.72e-12
+                            131.72e-12
+                            131.72e-12
+                        </delay_matrix>
+                  </pb_type>
+                  <pb_type name="adder" blif_model=".subckt adder" num_pb="1">
+                    <input name="a" num_pins="1"/>
+                    <input name="b" num_pins="1"/>
+                    <input name="cin" num_pins="1"/>
+                    <output name="cout" num_pins="1"/>
+                    <output name="sumout" num_pins="1"/>
+                    <delay_constant max="68.74e-12" in_port="adder.a" out_port="adder.sumout"/>
+                    <delay_constant max="68.74e-12" in_port="adder.b" out_port="adder.sumout"/>
+                    <delay_constant max="35.46e-12" in_port="adder.cin" out_port="adder.sumout"/>
+                    <delay_constant max="49.32e-12" in_port="adder.a" out_port="adder.cout"/>
+                    <delay_constant max="49.32e-12" in_port="adder.b" out_port="adder.cout"/>
+                    <delay_constant max="25.56e-12" in_port="adder.cin" out_port="adder.cout"/>
+                  </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                  <interconnect>
+                    <direct name="clock" input="arithmetic.clk" output="ff.clk"/>
+                    <direct name="lut4_in1" input="arithmetic.in" output="lut4[0].in"/>
+                    <direct name="lut4_in2" input="arithmetic.in" output="lut4[1].in"/>
+                    <direct name="lut_to_add1" input="lut4[0:0].out" output="adder.a"/>
+                    <direct name="lut_to_add2" input="lut4[1:1].out" output="adder.b"/>
+                    <direct name="add_to_ff" input="adder.sumout" output="ff.D">
+                      <delay_constant max="18.96e-12" in_port="adder.sumout" out_port="ff.D"/>
+                      <!--pack_pattern name="chain" in_port="adder.sumout" out_port="ff.D"/-->
+                    </direct>
+                    <direct name="carry_in" input="arithmetic.cin" output="adder.cin">
+                      <pack_pattern name="chain" in_port="arithmetic.cin" out_port="adder.cin"/>
+                    </direct>
+                    <direct name="carry_out" input="adder.cout" output="arithmetic.cout">
+                      <pack_pattern name="chain" in_port="adder.cout" out_port="arithmetic.cout"/>
+                    </direct>
+                    <complete name="sumout" input="ff.Q adder.sumout" output="arithmetic.out">
+                      <delay_constant max="39.85e-12" in_port="adder.sumout" out_port="arithmetic.out"/>
+                      <delay_constant max="39.85e-12" in_port="ff.Q" out_port="arithmetic.out"/>
+                    </complete>
+                  </interconnect>
+                </pb_type>
+                <interconnect>
+                  <direct name="direct1" input="ble5.in[3:0]" output="arithmetic.in"/>
+                  <direct name="carry_in" input="ble5.cin" output="arithmetic.cin">
+                    <pack_pattern name="chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                  </direct>
+                  <direct name="carry_out" input="arithmetic.cout" output="ble5.cout">
+                    <pack_pattern name="chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                  </direct>
+                  <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/>
+                  <direct name="direct3" input="arithmetic.out" output="ble5.out"/>
+                </interconnect>
+              </mode>
+            </pb_type>
+            <interconnect>
+              <!-- Shared inputs between the two 5-LUTs -->
+              <complete name="lut5_reg1" input="fle.in[0]" output="ble5[0].in[0] ble5[1].in[1]"/>
+              <complete name="lut5_reg2" input="fle.in[1]" output="ble5[0].in[1] ble5[1].in[0]"/>
+              <!-- Rest of the 5-LUT inputs -->
+              <direct name="lut5_inputs_1" input="fle.in[4:2]" output="ble5[0].in[4:2]"/>
+              <direct name="lut5_inputs_22" input="fle.in[7:5]" output="ble5[1].in[4:2]"/>
+              <direct name="lut5_outputs_1" input="ble5[0].out" output="fle.out[1:0]"/>
+              <direct name="lut5_outputs_2" input="ble5[1].out" output="fle.out[3:2]"/>
+              <direct name="carry_in" input="fle.cin" output="ble5[0].cin">
+                <pack_pattern name="chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+              </direct>
+              <direct name="carry_out" input="ble5[1].cout" output="fle.cout">
+                <pack_pattern name="chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+              </direct>
+              <direct name="carry_link" input="ble5[0].cout" output="ble5[1].cin">
+                <pack_pattern name="chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/> <!-- ports mirror this direct's input/output so the intra-fle carry hop is part of the "chain" pattern -->
+              </direct>
+              <complete name="clock" input="fle.clk" output="ble5[1:0].clk"/>
+            </interconnect>
+          </mode>
+          <!-- n2_lut5 -->
+          <mode name="n1_lut6">
+            <pb_type name="ble6" num_pb="1">
+              <input name="in" num_pins="6"/>
+              <output name="out" num_pins="4"/>
+              <clock name="clk" num_pins="1"/>
+              <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut">
+                <input name="in" num_pins="6" port_class="lut_in"/>
+                <output name="out" num_pins="1" port_class="lut_out"/>
+                <!-- LUT timing using delay matrix -->
+                <!-- These are the physical delay inputs on a Stratix 10 LUT but because VPR cannot do LUT rebalancing,
+                           we instead take the average of these numbers to get more stable results
+                           257.8e-12
+                           253.69e-12
+                           190.3e-12
+                           183.78e-12
+                           114.96e-12
+                           77.18e-12
+                      -->
+                <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out">
+                        179.6e-12
+                        179.6e-12
+                        179.6e-12
+                        179.6e-12
+                        179.6e-12
+                        179.6e-12
+                    </delay_matrix>
+              </pb_type>
+              <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                <input name="D" num_pins="1" port_class="D"/>
+                <output name="Q" num_pins="1" port_class="Q"/>
+                <clock name="clk" num_pins="1" port_class="clock"/>
+                <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+              </pb_type>
+              <interconnect>
+                <direct name="lut6_inputs" input="ble6.in" output="lut6.in"/>
+                <direct name="lut6_ff" input="lut6.out" output="ff[1].D">
+                  <delay_constant max="18.96e-12" in_port="lut6.out" out_port="ff[1].D"/>
+                  <pack_pattern name="ble6" in_port="lut6.out" out_port="ff[1].D"/>
+                </direct>
+                <complete name="clock" input="ble6.clk" output="ff.clk"/>
+                <direct name="input_to_ff" input="ble6.in[0]" output="ff[0].D"/>
+                <mux name="mux1" input="ff[0].Q lut6.out" output="ble6.out[0]">
+                  <delay_constant max="39.85e-12" in_port="lut6.out" out_port="ble6.out[0]"/>
+                  <delay_constant max="39.85e-12" in_port="ff[0].Q" out_port="ble6.out[0]"/>
+                </mux>
+                <!-- This mux is the same as mux1 but drives ble6.out[1] -->
+                <mux name="mux2" input="ff[0].Q lut6.out" output="ble6.out[1]">
+                  <delay_constant max="39.85e-12" in_port="lut6.out" out_port="ble6.out[1]"/>
+                  <delay_constant max="39.85e-12" in_port="ff[0].Q" out_port="ble6.out[1]"/>
+                </mux>
+                <mux name="mux3" input="ff[1].Q lut6.out" output="ble6.out[2]">
+                  <delay_constant max="39.85e-12" in_port="lut6.out" out_port="ble6.out[2]"/>
+                  <delay_constant max="39.85e-12" in_port="ff[1].Q" out_port="ble6.out[2]"/>
+                </mux>
+                <!-- This mux is the same as mux3 but drives ble6.out[3] -->
+                <mux name="mux4" input="ff[1].Q lut6.out" output="ble6.out[3]">
+                  <delay_constant max="39.85e-12" in_port="lut6.out" out_port="ble6.out[3]"/>
+                  <delay_constant max="39.85e-12" in_port="ff[1].Q" out_port="ble6.out[3]"/>
+                </mux>
+              </interconnect>
+            </pb_type>
+            <interconnect>
+              <!-- ble6 takes inputs A, B, C, D, E, & F; where F is fle[7] -->
+              <direct name="lut6_inputs1" input="fle.in[4:0]" output="ble6.in[4:0]"/>
+              <direct name="lut6_inputs2" input="fle.in[7]" output="ble6.in[5]"/>
+              <direct name="direct2" input="ble6.out" output="fle.out"/>
+              <direct name="direct4" input="fle.clk" output="ble6.clk"/>
+            </interconnect>
+          </mode>
+          <!-- n1_lut6 -->
+        </pb_type>
+        <interconnect>
+          <!-- 50% sparsely populated local routing -->
+          <!-- This 50% sparsity pattern divides the cluster inputs and local feedbacks into four groups, 
+               and then selects two of the four groups to feed each LUT input. This means half of the cluster 
+               inputs and local feedbacks can feed each LUT input. There is partial overlap in the inputs that 
+               feed the various LUT inputs, which helps routability vs. simply having half the cluster inputs 
+               feed one set of half the LUT inputs and the other half of cluster inputs feed the other set of 
+               LUT inputs. This pattern is used by Stratix (I - 10) architectures. -->
+          <complete name="lutA" input="lab.I4 lab.I3" output="fle[9:0].in[0:0]">
+            <delay_constant max="74.71e-12" in_port="lab.I4" out_port="fle.in[0:0]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I3" out_port="fle.in[0:0]"/>
+          </complete>
+          <complete name="lutB" input="lab.I3 lab.I2" output="fle[9:0].in[1:1]">
+            <delay_constant max="74.71e-12" in_port="lab.I3" out_port="fle.in[1:1]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I2" out_port="fle.in[1:1]"/>
+          </complete>
+          <complete name="lutC" input="lab.I2 lab.I1" output="fle[9:0].in[2:2]">
+            <delay_constant max="74.71e-12" in_port="lab.I2" out_port="fle.in[2:2]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I1" out_port="fle.in[2:2]"/>
+          </complete>
+          <complete name="lutD" input="lab.I4 lab.I2" output="fle[9:0].in[3:3]">
+            <delay_constant max="74.71e-12" in_port="lab.I4" out_port="fle.in[3:3]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I2" out_port="fle.in[3:3]"/>
+          </complete>
+          <complete name="lutE" input="lab.I3 lab.I1" output="fle[9:0].in[4:4]">
+            <delay_constant max="74.71e-12" in_port="lab.I3" out_port="fle.in[4:4]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I1" out_port="fle.in[4:4]"/>
+          </complete>
+          <complete name="lutF" input="lab.I4 lab.I1" output="fle[9:0].in[5:5]">
+            <delay_constant max="74.71e-12" in_port="lab.I4" out_port="fle.in[5:5]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I1" out_port="fle.in[5:5]"/>
+          </complete>
+          <complete name="lutG" input="lab.I4 lab.I3" output="fle[9:0].in[6:6]">
+            <delay_constant max="74.71e-12" in_port="lab.I4" out_port="fle.in[6:6]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I3" out_port="fle.in[6:6]"/>
+          </complete>
+          <complete name="lutH" input="lab.I3 lab.I2" output="fle[9:0].in[7:7]">
+            <delay_constant max="74.71e-12" in_port="lab.I3" out_port="fle.in[7:7]"/>
+            <delay_constant max="74.71e-12" in_port="lab.I2" out_port="fle.in[7:7]"/>
+          </complete>
+          <complete name="clks" input="lab.clk" output="fle[9:0].clk"/>
+          <!-- This way of specifying the direct connections to the clb outputs is important because this architecture uses automatic spreading of opins.
+                     By grouping the output pins in this fashion, if a logic block is completely filled by 6-LUTs,
+                     then the outputs those 6-LUTs use get evenly distributed across all four sides of the CLB instead of being clumped on two sides (which is what happens with a more
+                     naive specification).
+              -->
+          <direct name="labouts1" input="fle[9:0].out[0]" output="lab.O[9:0]"/>
+          <direct name="labouts2" input="fle[9:0].out[1]" output="lab.O[19:10]"/>
+          <direct name="labouts3" input="fle[9:0].out[2]" output="lab.O[29:20]"/>
+          <direct name="labouts4" input="fle[9:0].out[3]" output="lab.O[39:30]"/>
+          <!-- Carry chain links -->
+          <direct name="carry_in" input="lab.cin" output="fle[0:0].cin">
+            <!-- Put all inter-block carry chain delay on this one edge -->
+            <delay_constant max="18.47e-12" in_port="lab.cin" out_port="fle[0:0].cin"/>
+            <pack_pattern name="chain" in_port="lab.cin" out_port="fle[0:0].cin"/>
+          </direct>
+          <direct name="carry_out" input="fle[9:9].cout" output="lab.cout">
+            <pack_pattern name="chain" in_port="fle[9:9].cout" out_port="lab.cout"/>
+          </direct>
+          <direct name="carry_link" input="fle[8:0].cout" output="fle[9:1].cin">
+            <pack_pattern name="chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+          </direct>
+        </interconnect>
+      </pb_type>
+      <interconnect>
+        <direct name="carry_in" input="clb.cin" output="lab.cin"/>
+        <direct name="carry_out" input="lab.cout" output="clb.cout"/>
+        <direct name="clock" input="clb.clk" output="lab.clk"/>
+        <complete name="Input_feedback_I1" input="clb.I1 lab.O[4:0]" output="lab.I1"/>
+        <complete name="Input_feedback_I2" input="clb.I2 lab.O[24:20]" output="lab.I2"/>
+        <complete name="Input_feedback_I3" input="clb.I3 lab.O[9:5]" output="lab.I3"/>
+        <complete name="Input_feedback_I4" input="clb.I4 lab.O[29:25]" output="lab.I4"/>
+        <!--
+        <direct name="Input_I1" input="clb.I1" output="lab.I1"/>
+        <direct name="Input_I2" input="clb.I2" output="lab.I2"/>
+        <direct name="Input_I3" input="clb.I3" output="lab.I3"/>
+        <direct name="Input_I4" input="clb.I4" output="lab.I4"/>
+        -->
+        <direct name="output" input="lab.O" output="clb.O"/>
+      </interconnect>
+    </pb_type>
+    <!-- Define general purpose logic block (CLB) ends -->
+
+    <!-- Define DSP slice begin -->
+    <pb_type name="dsp_top">
+      <input name="reset" num_pins="1" is_non_clock_global="true"/>
+      <input name="dsp_I1" num_pins="64" />
+      <input name="dsp_I2" num_pins="64" />
+      <input name="chainin" num_pins="64"/>
+      <input name="scanin" num_pins="27"/>
+      <output name="result" num_pins="74"/>
+      <output name="chainout" num_pins="64"/>
+      <output name="scanout" num_pins="27"/>
+      <clock name="clk" num_pins="1"/>
+
+    <pb_type name="dsp" num_pb="1">
+      <input name="reset" num_pins="1"/>
+      <input name="dsp_I1" num_pins="64"/>
+      <input name="dsp_I2" num_pins="64"/>
+      <input name="chainin" num_pins="64"/>
+      <input name="scanin" num_pins="27"/>
+      <output name="result" num_pins="74"/>
+      <output name="chainout" num_pins="64"/>
+      <output name="scanout" num_pins="27"/>
+      <clock name="clk" num_pins="1"/>
+
+      <pb_type name="dsp_pb" num_pb="1">
+        <input name="reset" num_pins="1"/>
+        <input name="mode_sigs" num_pins="12"/>
+        <input name="datain" num_pins="116"/>
+        <input name="chainin" num_pins="64"/>
+        <input name="scanin" num_pins="27"/>
+        <output name="result" num_pins="74"/>
+        <output name="chainout" num_pins="64"/>
+        <output name="scanout" num_pins="27"/>
+        <clock name="clk" num_pins="1"/>
+
+        <!-- Fixed-point multiplier mode (one 27x27 multiplier): result[53:0] = datain[26:0] * datain[53:27]. Only datain and result are wired in this mode. -->
+        <mode name="one_mult_27x27">
+          <pb_type name="one_mult_27x27" num_pb="1"> <!-- wrapper around the single multiply primitive below -->
+            <input name="a" num_pins="27"/>
+            <input name="b" num_pins="27"/>
+            <output name="out" num_pins="54"/>
+            <pb_type name="mult_27x27" blif_model=".subckt multiply" num_pb="1"> <!-- leaf primitive bound to the "multiply" subckt model -->
+              <input name="a" num_pins="27"/>
+              <input name="b" num_pins="27"/>
+              <output name="out" num_pins="54"/>
+              <delay_constant max="2.14e-9" in_port="mult_27x27.a" out_port="mult_27x27.out"/> <!-- same max delay from either operand to out -->
+              <delay_constant max="2.14e-9" in_port="mult_27x27.b" out_port="mult_27x27.out"/>
+            </pb_type>
+            <interconnect> <!-- one-to-one pass-through between wrapper ports and primitive ports -->
+              <direct name="a2a" input="one_mult_27x27.a" output="mult_27x27.a">
+              </direct>
+              <direct name="b2b" input="one_mult_27x27.b" output="mult_27x27.b">
+              </direct>
+              <direct name="out2out" input="mult_27x27.out" output="one_mult_27x27.out">
+              </direct>
+            </interconnect>
+          </pb_type>
+          <interconnect> <!-- slice datain into the two operands; the product occupies the low 54 result bits -->
+            <direct name="datain2a" input="dsp_pb.datain[26:0]" output="one_mult_27x27.a">
+            </direct>
+            <direct name="datain2b" input="dsp_pb.datain[53:27]" output="one_mult_27x27.b">
+            </direct>
+            <direct name="out2dataout" input="one_mult_27x27.out" output="dsp_pb.result[53:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point multiplier mode (two independent 18x19 multipliers). Instance [0]: a = datain[17:0], b = datain[36:18]; instance [1]: a = datain[54:37], b = datain[73:55]. The two 37-bit products together drive result[73:0]; which instance lands in which half depends on how VPR expands two_mult_18x19.out (TODO confirm). -->
+        <mode name="two_mult_18x19">
+          <pb_type name="two_mult_18x19" num_pb="2"> <!-- two copies of the wrapper, one per multiplier -->
+            <input name="a" num_pins="18"/>
+            <input name="b" num_pins="19"/>
+            <output name="out" num_pins="37"/>
+            <pb_type name="mult_18x19" blif_model=".subckt multiply" num_pb="1"> <!-- leaf primitive bound to the "multiply" subckt model -->
+              <input name="a" num_pins="18"/>
+              <input name="b" num_pins="19"/>
+              <output name="out" num_pins="37"/>
+              <delay_constant max="2.14e-9" in_port="mult_18x19.a" out_port="mult_18x19.out"/> <!-- same max delay from either operand to out -->
+              <delay_constant max="2.14e-9" in_port="mult_18x19.b" out_port="mult_18x19.out"/>
+            </pb_type>
+            <interconnect> <!-- one-to-one pass-through between wrapper ports and primitive ports -->
+              <direct name="a2a" input="two_mult_18x19.a" output="mult_18x19.a">
+                 </direct>
+              <direct name="b2b" input="two_mult_18x19.b" output="mult_18x19.b">
+                 </direct>
+              <direct name="out2out" input="mult_18x19.out" output="two_mult_18x19.out">
+                 </direct>
+            </interconnect>
+          </pb_type>
+          <interconnect> <!-- operands are taken from four consecutive datain slices covering bits 73:0 -->
+            <direct name="datain2a1" input="dsp_pb.datain[17:0]" output="two_mult_18x19[0].a">
+            </direct>
+            <direct name="datain2b1" input="dsp_pb.datain[36:18]" output="two_mult_18x19[0].b">
+            </direct>
+            <direct name="datain2a2" input="dsp_pb.datain[54:37]" output="two_mult_18x19[1].a">
+            </direct>
+            <direct name="datain2b2" input="dsp_pb.datain[73:55]" output="two_mult_18x19[1].b">
+            </direct>
+            <direct name="out2result" input="two_mult_18x19.out" output="dsp_pb.result[73:0]"> <!-- unindexed: both instances' out ports (2 x 37 pins) feed the 74 result pins -->
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point multiplier mode (four independent 9x9 multipliers). For i = 0..3, instance [i] takes a = datain[18i+8 : 18i] and b = datain[18i+17 : 18i+9], and drives result[18i+17 : 18i]. -->
+        <mode name="mult_9x9_fixed_pt_mode">
+          <pb_type name="mult_9x9_fixed_pt" blif_model=".subckt multiply" num_pb="4"> <!-- leaf primitive used directly (no wrapper pb_type), four copies -->
+            <input name="a" num_pins="9"/>
+            <input name="b" num_pins="9"/>
+            <output name="out" num_pins="18"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_9x9_fixed_pt.a" out_port="mult_9x9_fixed_pt.out"/> <!-- same max delay from either operand to out -->
+            <delay_constant max="2.14e-9" in_port="mult_9x9_fixed_pt.b" out_port="mult_9x9_fixed_pt.out"/>
+          </pb_type>
+          <interconnect> <!-- operand slices cover datain[71:0]; products cover result[71:0] -->
+            <direct name="atoa0" input="dsp_pb.datain[8:0]" output="mult_9x9_fixed_pt[0].a"/>
+            <direct name="btob0" input="dsp_pb.datain[17:9]" output="mult_9x9_fixed_pt[0].b"/>
+            <direct name="atoa1" input="dsp_pb.datain[26:18]" output="mult_9x9_fixed_pt[1].a"/>
+            <direct name="btob1" input="dsp_pb.datain[35:27]" output="mult_9x9_fixed_pt[1].b"/>
+            <direct name="atoa2" input="dsp_pb.datain[44:36]" output="mult_9x9_fixed_pt[2].a"/>
+            <direct name="btob2" input="dsp_pb.datain[53:45]" output="mult_9x9_fixed_pt[2].b"/>
+            <direct name="atoa3" input="dsp_pb.datain[62:54]" output="mult_9x9_fixed_pt[3].a"/>
+            <direct name="btob3" input="dsp_pb.datain[71:63]" output="mult_9x9_fixed_pt[3].b"/>
+            <direct name="sumouttosumout0" input="mult_9x9_fixed_pt[0].out" output="dsp_pb.result[17:0]"/>
+            <direct name="sumouttosumout1" input="mult_9x9_fixed_pt[1].out" output="dsp_pb.result[35:18]"/>
+            <direct name="sumouttosumout2" input="mult_9x9_fixed_pt[2].out" output="dsp_pb.result[53:36]"/>
+            <direct name="sumouttosumout3" input="mult_9x9_fixed_pt[3].out" output="dsp_pb.result[71:54]"/>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point sum-of-2-products mode: result = (bx * by) + (ax * ay) + chainin, chainout = result. ax = datain[17:0], ay = datain[36:18], bx = datain[54:37], by = datain[73:55]; only the low 37 bits of chainin, result and chainout are used. -->
+        <mode name="sop_2_mode">
+          <pb_type name="sop_2" num_pb="1" blif_model=".subckt int_sop_2"> <!-- leaf primitive bound to the "int_sop_2" subckt model -->
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="ax" num_pins="18"/>
+            <input name="ay" num_pins="19"/>
+            <input name="bx" num_pins="18"/>
+            <input name="by" num_pins="19"/>
+            <input name="chainin" num_pins="37"/>
+            <output name="result" num_pins="37"/>
+            <output name="chainout" num_pins="37"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="sop_2.reset" out_port="sop_2.result"/> <!-- group 1: max delay from every input to result -->
+            <delay_constant max="2.14e-9" in_port="sop_2.mode_sigs" out_port="sop_2.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.ax" out_port="sop_2.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.ay" out_port="sop_2.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.bx" out_port="sop_2.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.by" out_port="sop_2.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.chainin" out_port="sop_2.result"/>
+
+            <delay_constant max="2.14e-9" in_port="sop_2.reset" out_port="sop_2.chainout"/> <!-- group 2: the same delay from every input to chainout -->
+            <delay_constant max="2.14e-9" in_port="sop_2.mode_sigs" out_port="sop_2.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.ax" out_port="sop_2.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.ay" out_port="sop_2.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.bx" out_port="sop_2.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.by" out_port="sop_2.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_2.chainin" out_port="sop_2.chainout"/>
+
+            <T_setup value="18.91e-12" port="sop_2.mode_sigs" clock="clk"/> <!-- setup time against clk for every input, plus the result output -->
+            <T_setup value="18.91e-12" port="sop_2.ax" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_2.ay" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_2.bx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_2.by" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_2.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_2.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_2.result" clock="clk"/> <!-- NOTE(review): setup on an output port; presumably models an internal register path, confirm against the VPR timing docs -->
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.mode_sigs" clock="clk"/> <!-- clock-to-Q (min equals max) for the same port list as T_setup -->
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.ax" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.ay" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.bx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.by" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_2.result" clock="clk"/>
+          </pb_type>
+          <interconnect> <!-- control ports pass straight through; operands are consecutive slices of datain[73:0] -->
+            <direct name="clk" input="dsp_pb.clk" output="sop_2.clk">
+            </direct>
+            <direct name="reset" input="dsp_pb.reset" output="sop_2.reset">
+            </direct>
+            <direct name="modesigs" input="dsp_pb.mode_sigs" output="sop_2.mode_sigs">
+            </direct>
+            <direct name="datain2ax" input="dsp_pb.datain[17:0]" output="sop_2.ax">
+            </direct>
+            <direct name="datain2ay" input="dsp_pb.datain[36:18]" output="sop_2.ay">
+            </direct>
+            <direct name="datain2bx" input="dsp_pb.datain[54:37]" output="sop_2.bx">
+            </direct>
+            <direct name="datain2by" input="dsp_pb.datain[73:55]" output="sop_2.by">
+            </direct>
+            <direct name="chainin"   input="dsp_pb.chainin[36:0]" output="sop_2.chainin">
+            </direct>
+            <direct name="dataout2result" input="sop_2.result" output="dsp_pb.result[36:0]">
+            </direct>
+            <direct name="chainout" input="sop_2.chainout" output="dsp_pb.chainout[36:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point multiply-add mode, 18x19 variant: result = (ax * ay) + bx + chainin, chainout = result, with scanin/scanout support. ax = datain[17:0], ay = datain[36:18], bx = datain[72:37]; chainin, result and chainout use 64 bits, scanin and scanout use only bits 18:0. -->
+        <mode name="mult_add_mode_18_19_36">
+          <pb_type name="mult_add" num_pb="1" blif_model=".subckt mult_add_int_18x19"> <!-- leaf primitive bound to the "mult_add_int_18x19" subckt model -->
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="ax" num_pins="18"/>
+            <input name="ay" num_pins="19"/>
+            <input name="bx" num_pins="36"/>
+            <input name="chainin" num_pins="64"/>
+            <input name="scanin" num_pins="19"/>
+            <output name="result" num_pins="64"/>
+            <output name="chainout" num_pins="64"/>
+            <output name="scanout" num_pins="19"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_add.reset" out_port="mult_add.result"/> <!-- group 1: max delay from every input to result -->
+            <delay_constant max="2.14e-9" in_port="mult_add.mode_sigs" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ax" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ay" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.bx" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.chainin" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.scanin" out_port="mult_add.result"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_add.reset" out_port="mult_add.chainout"/> <!-- group 2: the same delay from every input to chainout -->
+            <delay_constant max="2.14e-9" in_port="mult_add.mode_sigs" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ax" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ay" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.bx" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.chainin" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.scanin" out_port="mult_add.chainout"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_add.reset" out_port="mult_add.scanout"/> <!-- group 3: the same delay from every input to scanout -->
+            <delay_constant max="2.14e-9" in_port="mult_add.mode_sigs" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ax" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ay" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.bx" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.chainin" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.scanin" out_port="mult_add.scanout"/>
+
+            <T_setup value="18.91e-12" port="mult_add.mode_sigs" clock="clk"/> <!-- setup time against clk for every input, plus the result output -->
+            <T_setup value="18.91e-12" port="mult_add.ax" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.ay" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.bx" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.scanin" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.result" clock="clk"/> <!-- NOTE(review): setup on an output port; presumably models an internal register path, confirm against the VPR timing docs -->
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.mode_sigs" clock="clk"/> <!-- clock-to-Q (min equals max) for the same port list as T_setup -->
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.ax" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.ay" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.bx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.scanin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.result" clock="clk"/>
+          </pb_type>
+          <interconnect> <!-- control ports pass straight through; operands are consecutive slices of datain[72:0] -->
+            <direct name="clk" input="dsp_pb.clk" output="mult_add.clk">
+            </direct>
+            <direct name="reset" input="dsp_pb.reset" output="mult_add.reset">
+            </direct>
+            <direct name="modesigs" input="dsp_pb.mode_sigs" output="mult_add.mode_sigs">
+            </direct>
+            <direct name="datain2ax" input="dsp_pb.datain[17:0]" output="mult_add.ax">
+            </direct>
+            <direct name="datain2ay" input="dsp_pb.datain[36:18]" output="mult_add.ay">
+            </direct>
+            <direct name="datain2bx" input="dsp_pb.datain[72:37]" output="mult_add.bx">
+            </direct>
+            <direct name="chainin"   input="dsp_pb.chainin[63:0]" output="mult_add.chainin">
+            </direct>
+            <direct name="scanin"   input="dsp_pb.scanin[18:0]" output="mult_add.scanin">
+            </direct>
+            <direct name="dataout2result" input="mult_add.result" output="dsp_pb.result[63:0]">
+            </direct>
+            <direct name="chainout" input="mult_add.chainout" output="dsp_pb.chainout[63:0]">
+            </direct>
+            <direct name="scanout" input="mult_add.scanout" output="dsp_pb.scanout[18:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point multiply-add mode, 27x27 variant: result = (ax * ay) + bx + chainin, chainout = result, with scanin/scanout support. ax = datain[26:0], ay = datain[53:27], bx = datain[89:54]; chainin, result and chainout use 64 bits, scanin and scanout use all 27 bits. -->
+        <mode name="mult_add_mode_27_27_64">
+          <pb_type name="mult_add" num_pb="1" blif_model=".subckt mult_add_int_27x27"> <!-- leaf primitive bound to the "mult_add_int_27x27" subckt model -->
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="ax" num_pins="27"/>
+            <input name="ay" num_pins="27"/>
+            <input name="bx" num_pins="36"/>
+            <input name="chainin" num_pins="64"/>
+            <input name="scanin" num_pins="27"/>
+            <output name="result" num_pins="64"/>
+            <output name="chainout" num_pins="64"/>
+            <output name="scanout" num_pins="27"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_add.reset" out_port="mult_add.result"/> <!-- group 1: max delay from every input to result -->
+            <delay_constant max="2.14e-9" in_port="mult_add.mode_sigs" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ax" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ay" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.bx" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.chainin" out_port="mult_add.result"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.scanin" out_port="mult_add.result"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_add.reset" out_port="mult_add.chainout"/> <!-- group 2: the same delay from every input to chainout -->
+            <delay_constant max="2.14e-9" in_port="mult_add.mode_sigs" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ax" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ay" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.bx" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.chainin" out_port="mult_add.chainout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.scanin" out_port="mult_add.chainout"/>
+
+            <delay_constant max="2.14e-9" in_port="mult_add.reset" out_port="mult_add.scanout"/> <!-- group 3: the same delay from every input to scanout -->
+            <delay_constant max="2.14e-9" in_port="mult_add.mode_sigs" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ax" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.ay" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.bx" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.chainin" out_port="mult_add.scanout"/>
+            <delay_constant max="2.14e-9" in_port="mult_add.scanin" out_port="mult_add.scanout"/>
+
+            <T_setup value="18.91e-12" port="mult_add.mode_sigs" clock="clk"/> <!-- setup time against clk for every input, plus the result output -->
+            <T_setup value="18.91e-12" port="mult_add.ax" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.ay" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.bx" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.scanin" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_add.result" clock="clk"/> <!-- NOTE(review): setup on an output port; presumably models an internal register path, confirm against the VPR timing docs -->
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.mode_sigs" clock="clk"/> <!-- clock-to-Q (min equals max) for the same port list as T_setup -->
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.ax" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.ay" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.bx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.scanin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_add.result" clock="clk"/>
+          </pb_type>
+          <interconnect> <!-- control ports pass straight through; operands are consecutive slices of datain[89:0] -->
+            <direct name="clk" input="dsp_pb.clk" output="mult_add.clk">
+            </direct>
+            <direct name="reset" input="dsp_pb.reset" output="mult_add.reset">
+            </direct>
+            <direct name="modesigs" input="dsp_pb.mode_sigs" output="mult_add.mode_sigs">
+            </direct>
+            <direct name="datain2ax" input="dsp_pb.datain[26:0]" output="mult_add.ax">
+            </direct>
+            <direct name="datain2ay" input="dsp_pb.datain[53:27]" output="mult_add.ay">
+            </direct>
+            <direct name="datain2bx" input="dsp_pb.datain[89:54]" output="mult_add.bx">
+            </direct>
+            <direct name="chainin"   input="dsp_pb.chainin[63:0]" output="mult_add.chainin">
+            </direct>
+            <direct name="scanin"   input="dsp_pb.scanin[26:0]" output="mult_add.scanin">
+            </direct>
+            <direct name="dataout2result" input="mult_add.result" output="dsp_pb.result[63:0]">
+            </direct>
+            <direct name="chainout" input="mult_add.chainout" output="dsp_pb.chainout[63:0]">
+            </direct>
+            <direct name="scanout" input="mult_add.scanout" output="dsp_pb.scanout[26:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point sum-of-4-products mode: result = (dx * dy) + (cx * cy) + (bx * by) + (ax * ay) + chainin, chainout = result. The eight 9-bit operands are consecutive slices of datain[71:0] in the order ax, ay, bx, by, cx, cy, dx, dy; chainin, result and chainout use 64 bits. -->
+        <mode name="sop_4_mode">
+          <pb_type name="sop_4" num_pb="1" blif_model=".subckt int_sop_4"> <!-- leaf primitive bound to the "int_sop_4" subckt model -->
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="ax" num_pins="9"/>
+            <input name="ay" num_pins="9"/>
+            <input name="bx" num_pins="9"/>
+            <input name="by" num_pins="9"/>
+            <input name="cx" num_pins="9"/>
+            <input name="cy" num_pins="9"/>
+            <input name="dx" num_pins="9"/>
+            <input name="dy" num_pins="9"/>
+            <input name="chainin" num_pins="64"/>
+            <output name="result" num_pins="64"/>
+            <output name="chainout" num_pins="64"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="sop_4.reset" out_port="sop_4.result"/> <!-- group 1: max delay from every input to result -->
+            <delay_constant max="2.14e-9" in_port="sop_4.mode_sigs" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ax" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ay" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.bx" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.by" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cx" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cy" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dx" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dy" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.chainin" out_port="sop_4.result"/>
+
+            <delay_constant max="2.14e-9" in_port="sop_4.reset" out_port="sop_4.chainout"/> <!-- group 2: the same delay from every input to chainout -->
+            <delay_constant max="2.14e-9" in_port="sop_4.mode_sigs" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ax" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ay" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.bx" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.by" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cx" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cy" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dx" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dy" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.chainin" out_port="sop_4.chainout"/>
+
+            <T_setup value="18.91e-12" port="sop_4.mode_sigs" clock="clk"/> <!-- setup time against clk for every input, plus the result output -->
+            <T_setup value="18.91e-12" port="sop_4.ax" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.ay" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.bx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.by" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.cx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.cy" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.dx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.dy" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.result" clock="clk"/> <!-- NOTE(review): setup on an output port; presumably models an internal register path, confirm against the VPR timing docs -->
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.mode_sigs" clock="clk"/> <!-- clock-to-Q (min equals max) for the same port list as T_setup -->
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.ax" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.ay" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.bx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.by" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.cx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.cy" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.dx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.dy" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.result" clock="clk"/>
+          </pb_type>
+          <interconnect> <!-- control ports pass straight through; operands are consecutive 9-bit slices of datain[71:0] -->
+            <direct name="clk" input="dsp_pb.clk" output="sop_4.clk">
+            </direct>
+            <direct name="reset" input="dsp_pb.reset" output="sop_4.reset">
+            </direct>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="sop_4.mode_sigs">
+            </direct>
+            <direct name="datain2ax" input="dsp_pb.datain[8:0]" output="sop_4.ax">
+            </direct>
+            <direct name="datain2ay" input="dsp_pb.datain[17:9]" output="sop_4.ay">
+            </direct>
+            <direct name="datain2bx" input="dsp_pb.datain[26:18]" output="sop_4.bx">
+            </direct>
+            <direct name="datain2by" input="dsp_pb.datain[35:27]" output="sop_4.by">
+            </direct>
+            <direct name="datain2cx" input="dsp_pb.datain[44:36]" output="sop_4.cx">
+            </direct>
+            <direct name="datain2cy" input="dsp_pb.datain[53:45]" output="sop_4.cy">
+            </direct>
+            <direct name="datain2dx" input="dsp_pb.datain[62:54]" output="sop_4.dx">
+            </direct>
+            <direct name="datain2dy" input="dsp_pb.datain[71:63]" output="sop_4.dy">
+            </direct>
+            <direct name="chainin"   input="dsp_pb.chainin[63:0]" output="sop_4.chainin">
+            </direct>
+            <direct name="dataout2result" input="sop_4.result" output="dsp_pb.result[63:0]">
+            </direct>
+            <direct name="chainout" input="sop_4.chainout" output="dsp_pb.chainout[63:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point sum-of-4 accumulate mode: result = (dx * dy) + (cx * cy) + (bx * by) + (ax * ay) + chainin + accumulator; chainout = result. The eight 9-bit operands are taken from datain[71:0]; chainin, result and chainout are 64 bits wide. -->
+        <mode name="sop_4_accum_mode">
+          <pb_type name="sop_4" num_pb="1" blif_model=".subckt int_sop_accum_4">
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="ax" num_pins="9"/>
+            <input name="ay" num_pins="9"/>
+            <input name="bx" num_pins="9"/>
+            <input name="by" num_pins="9"/>
+            <input name="cx" num_pins="9"/>
+            <input name="cy" num_pins="9"/>
+            <input name="dx" num_pins="9"/>
+            <input name="dy" num_pins="9"/>
+            <input name="chainin" num_pins="64"/>
+            <output name="result" num_pins="64"/>
+            <output name="chainout" num_pins="64"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="sop_4.mode_sigs" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ax" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ay" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.bx" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.by" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cx" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cy" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dx" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dy" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.chainin" out_port="sop_4.result"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.reset" out_port="sop_4.result"/>
+
+            <delay_constant max="2.14e-9" in_port="sop_4.mode_sigs" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ax" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.ay" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.bx" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.by" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cx" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.cy" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dx" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.dy" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.chainin" out_port="sop_4.chainout"/>
+            <delay_constant max="2.14e-9" in_port="sop_4.reset" out_port="sop_4.chainout"/>
+
+            <T_setup value="18.91e-12" port="sop_4.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.ax" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.ay" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.bx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.by" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.cx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.cy" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.dx" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.dy" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="sop_4.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.ax" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.ay" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.bx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.by" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.cx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.cy" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.dx" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.dy" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="sop_4.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="clk" input="dsp_pb.clk" output="sop_4.clk">
+            </direct>
+            <direct name="reset" input="dsp_pb.reset" output="sop_4.reset">
+            </direct>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="sop_4.mode_sigs">
+            </direct>
+            <direct name="datain2ax" input="dsp_pb.datain[8:0]" output="sop_4.ax">
+            </direct>
+            <direct name="datain2ay" input="dsp_pb.datain[17:9]" output="sop_4.ay">
+            </direct>
+            <direct name="datain2bx" input="dsp_pb.datain[26:18]" output="sop_4.bx">
+            </direct>
+            <direct name="datain2by" input="dsp_pb.datain[35:27]" output="sop_4.by">
+            </direct>
+            <direct name="datain2cx" input="dsp_pb.datain[44:36]" output="sop_4.cx">
+            </direct>
+            <direct name="datain2cy" input="dsp_pb.datain[53:45]" output="sop_4.cy">
+            </direct>
+            <direct name="datain2dx" input="dsp_pb.datain[62:54]" output="sop_4.dx">
+            </direct>
+            <direct name="datain2dy" input="dsp_pb.datain[71:63]" output="sop_4.dy">
+            </direct>
+            <direct name="chainin"   input="dsp_pb.chainin[63:0]" output="sop_4.chainin">
+            </direct>
+            <direct name="dataout2result" input="sop_4.result" output="dsp_pb.result[63:0]">
+            </direct>
+            <direct name="chainout" input="sop_4.chainout" output="dsp_pb.chainout[63:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point 27x27 MAC mode (result = a*b + accumulated value): one mac_int_27x27 instance with a = datain[26:0], b = datain[53:27] and the 54-bit out driving result[53:0]. -->
+        <mode name="mac_27x27_fixed_pt_mode">
+          <pb_type name="mac_27x27_fixed_pt" blif_model=".subckt mac_int_27x27" num_pb="1">
+            <input name="reset" num_pins="1"/>
+            <input name="a" num_pins="27"/>
+            <input name="b" num_pins="27"/>
+            <output name="out" num_pins="54"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="mac_27x27_fixed_pt.a" out_port="mac_27x27_fixed_pt.out"/>
+            <delay_constant max="2.14e-9" in_port="mac_27x27_fixed_pt.b" out_port="mac_27x27_fixed_pt.out"/>
+            <delay_constant max="2.14e-9" in_port="mac_27x27_fixed_pt.reset" out_port="mac_27x27_fixed_pt.out"/>
+
+            <T_setup value="18.91e-12" port="mac_27x27_fixed_pt.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_27x27_fixed_pt.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_27x27_fixed_pt.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_27x27_fixed_pt.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_27x27_fixed_pt.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_27x27_fixed_pt.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_27x27_fixed_pt.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_27x27_fixed_pt.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset" input="dsp_pb.reset" output="mac_27x27_fixed_pt.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="mac_27x27_fixed_pt.clk"/>
+            <direct name="atoa" input="dsp_pb.datain[26:0]" output="mac_27x27_fixed_pt.a"/>
+            <direct name="btob" input="dsp_pb.datain[53:27]" output="mac_27x27_fixed_pt.b"/>
+            <direct name="sumouttosumout" input="mac_27x27_fixed_pt.out" output="dsp_pb.result[53:0]"/>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point 18x19 MAC mode (result = a*b + accumulated value): two mac_int_18x19 instances; instance 0 uses datain[36:0] and drives result[36:0], instance 1 uses datain[73:37] and drives result[73:37]. -->
+        <mode name="mac_18x19_fixed_pt_mode">
+          <pb_type name="mac_fixed_pt" blif_model=".subckt mac_int_18x19" num_pb="2">
+            <input name="reset" num_pins="1"/>
+            <input name="a" num_pins="18"/>
+            <input name="b" num_pins="19"/>
+            <output name="out" num_pins="37"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="mac_fixed_pt.a" out_port="mac_fixed_pt.out"/>
+            <delay_constant max="2.14e-9" in_port="mac_fixed_pt.b" out_port="mac_fixed_pt.out"/>
+            <delay_constant max="2.14e-9" in_port="mac_fixed_pt.reset" out_port="mac_fixed_pt.out"/>
+
+            <T_setup value="18.91e-12" port="mac_fixed_pt.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fixed_pt.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fixed_pt.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fixed_pt.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fixed_pt.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fixed_pt.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fixed_pt.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fixed_pt.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset0" input="dsp_pb.reset" output="mac_fixed_pt[0].reset"/>
+            <direct name="reset1" input="dsp_pb.reset" output="mac_fixed_pt[1].reset"/>
+            <direct name="clk0" input="dsp_pb.clk" output="mac_fixed_pt[0].clk"/>
+            <direct name="clk1" input="dsp_pb.clk" output="mac_fixed_pt[1].clk"/>
+            <direct name="atoa0" input="dsp_pb.datain[17:0]" output="mac_fixed_pt[0].a"/>
+            <direct name="btob0" input="dsp_pb.datain[36:18]" output="mac_fixed_pt[0].b"/>
+            <direct name="atoa1" input="dsp_pb.datain[54:37]" output="mac_fixed_pt[1].a"/>
+            <direct name="btob1" input="dsp_pb.datain[73:55]" output="mac_fixed_pt[1].b"/>
+            <direct name="sumouttosumout0" input="mac_fixed_pt[0].out" output="dsp_pb.result[36:0]"/>
+            <direct name="sumouttosumout1" input="mac_fixed_pt[1].out" output="dsp_pb.result[73:37]"/>
+          </interconnect>
+        </mode>
+
+        <!-- Fixed-point 9x9 MAC mode (result = a*b + accumulated value): four mac_int_9x9 instances; instance i takes a and b from consecutive 9-bit slices of datain[71:0] and drives an 18-bit slice of result[71:0]. -->
+        <mode name="mac_9x9_fixed_pt_mode">
+          <pb_type name="mac_9x9_fixed_pt" blif_model=".subckt mac_int_9x9" num_pb="4">
+            <input name="reset" num_pins="1"/>
+            <input name="a" num_pins="9"/>
+            <input name="b" num_pins="9"/>
+            <output name="out" num_pins="18"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.14e-9" in_port="mac_9x9_fixed_pt.a" out_port="mac_9x9_fixed_pt.out"/>
+            <delay_constant max="2.14e-9" in_port="mac_9x9_fixed_pt.b" out_port="mac_9x9_fixed_pt.out"/>
+            <delay_constant max="2.14e-9" in_port="mac_9x9_fixed_pt.reset" out_port="mac_9x9_fixed_pt.out"/>
+
+            <T_setup value="18.91e-12" port="mac_9x9_fixed_pt.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_9x9_fixed_pt.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_9x9_fixed_pt.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_9x9_fixed_pt.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_9x9_fixed_pt.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_9x9_fixed_pt.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_9x9_fixed_pt.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_9x9_fixed_pt.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset0" input="dsp_pb.reset" output="mac_9x9_fixed_pt[0].reset"/>
+            <direct name="reset1" input="dsp_pb.reset" output="mac_9x9_fixed_pt[1].reset"/>
+            <direct name="reset2" input="dsp_pb.reset" output="mac_9x9_fixed_pt[2].reset"/>
+            <direct name="reset3" input="dsp_pb.reset" output="mac_9x9_fixed_pt[3].reset"/>
+            <direct name="clk0" input="dsp_pb.clk" output="mac_9x9_fixed_pt[0].clk"/>
+            <direct name="clk1" input="dsp_pb.clk" output="mac_9x9_fixed_pt[1].clk"/>
+            <direct name="clk2" input="dsp_pb.clk" output="mac_9x9_fixed_pt[2].clk"/>
+            <direct name="clk3" input="dsp_pb.clk" output="mac_9x9_fixed_pt[3].clk"/>
+            <direct name="atoa0" input="dsp_pb.datain[8:0]" output="mac_9x9_fixed_pt[0].a"/>
+            <direct name="btob0" input="dsp_pb.datain[17:9]" output="mac_9x9_fixed_pt[0].b"/>
+            <direct name="atoa1" input="dsp_pb.datain[26:18]" output="mac_9x9_fixed_pt[1].a"/>
+            <direct name="btob1" input="dsp_pb.datain[35:27]" output="mac_9x9_fixed_pt[1].b"/>
+            <direct name="atoa2" input="dsp_pb.datain[44:36]" output="mac_9x9_fixed_pt[2].a"/>
+            <direct name="btob2" input="dsp_pb.datain[53:45]" output="mac_9x9_fixed_pt[2].b"/>
+            <direct name="atoa3" input="dsp_pb.datain[62:54]" output="mac_9x9_fixed_pt[3].a"/>
+            <direct name="btob3" input="dsp_pb.datain[71:63]" output="mac_9x9_fixed_pt[3].b"/>
+            <direct name="sumouttosumout0" input="mac_9x9_fixed_pt[0].out" output="dsp_pb.result[17:0]"/>
+            <direct name="sumouttosumout1" input="mac_9x9_fixed_pt[1].out" output="dsp_pb.result[35:18]"/>
+            <direct name="sumouttosumout2" input="mac_9x9_fixed_pt[2].out" output="dsp_pb.result[53:36]"/>
+            <direct name="sumouttosumout3" input="mac_9x9_fixed_pt[3].out" output="dsp_pb.result[71:54]"/>
+          </interconnect>
+        </mode>
+
+        <!-- Floating-point multiplier mode (result = a * b), 32-bit variant with no clock port: one mult_fp_32 instance with a = datain[31:0], b = datain[63:32] and out driving result[31:0]. -->
+        <mode name="mult_fp32_mode">
+          <pb_type name="mult_fp32" blif_model=".subckt mult_fp_32" num_pb="1">
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <output name="out" num_pins="32"/>
+
+            <delay_constant max="2.56e-9" in_port="mult_fp32.a" out_port="mult_fp32.out"/>
+            <delay_constant max="2.56e-9" in_port="mult_fp32.b" out_port="mult_fp32.out"/>
+          </pb_type>
+          <interconnect>
+            <direct name="a2a" input="dsp_pb.datain[31:0]" output="mult_fp32.a">
+            </direct>
+            <direct name="b2b" input="dsp_pb.datain[63:32]" output="mult_fp32.b">
+            </direct>
+            <direct name="out2out" input="mult_fp32.out" output="dsp_pb.result[31:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Floating-point multiplier mode (result = a * b), 16-bit variant with no clock port: two mult_fp_16 instances; the a inputs come from datain[31:0], the b inputs from datain[63:32], and the outs drive result[31:0]. -->
+        <mode name="mult_fp16_mode">
+          <pb_type name="mult_fp16" blif_model=".subckt mult_fp_16" num_pb="2">
+            <input name="a" num_pins="16"/>
+            <input name="b" num_pins="16"/>
+            <output name="out" num_pins="16"/>
+
+            <delay_constant max="2.56e-9" in_port="mult_fp16.a" out_port="mult_fp16.out"/>
+            <delay_constant max="2.56e-9" in_port="mult_fp16.b" out_port="mult_fp16.out"/>
+          </pb_type>
+          <interconnect>
+            <direct name="a2a0" input="dsp_pb.datain[15:0]" output="mult_fp16[0].a"></direct>
+            <direct name="a2a1" input="dsp_pb.datain[31:16]" output="mult_fp16[1].a"></direct>
+            <direct name="b2b0" input="dsp_pb.datain[47:32]" output="mult_fp16[0].b"></direct>
+            <direct name="b2b1" input="dsp_pb.datain[63:48]" output="mult_fp16[1].b"></direct>
+            <direct name="out2out0" input="mult_fp16[0].out" output="dsp_pb.result[15:0]"></direct>
+            <direct name="out2out1" input="mult_fp16[1].out" output="dsp_pb.result[31:16]"></direct>
+          </interconnect>
+        </mode>
+
+        <!-- Floating-point adder mode (result = a + b), 32-bit variant with no clock port: one addition_fp_32 instance with a = datain[31:0], b = datain[63:32] and out driving result[31:0]. -->
+        <mode name="adder_fp32_mode"> 
+          <pb_type name="adder_fp32" blif_model=".subckt addition_fp_32" num_pb="1">
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <output name="out" num_pins="32"/>
+
+            <delay_constant max="2.56e-9" in_port="adder_fp32.a" out_port="adder_fp32.out"/>
+            <delay_constant max="2.56e-9" in_port="adder_fp32.b" out_port="adder_fp32.out"/>
+          </pb_type>
+          <interconnect>
+            <direct name="atoa" input="dsp_pb.datain[31:0]" output="adder_fp32.a">
+            </direct>
+            <direct name="btob" input="dsp_pb.datain[63:32]" output="adder_fp32.b">
+            </direct>
+            <direct name="sumouttosumout" input="adder_fp32.out" output="dsp_pb.result[31:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Floating-point adder mode (result = a + b), 16-bit variant with no clock port: two addition_fp_16 instances; the a inputs come from datain[31:0], the b inputs from datain[63:32], and the outs drive result[31:0]. -->
+        <mode name="adder_fp16_mode">
+          <pb_type name="adder_fp16" blif_model=".subckt addition_fp_16" num_pb="2">
+            <input name="a" num_pins="16"/>
+            <input name="b" num_pins="16"/>
+            <output name="out" num_pins="16"/>
+
+            <delay_constant max="2.56e-9" in_port="adder_fp16.a" out_port="adder_fp16.out"/>
+            <delay_constant max="2.56e-9" in_port="adder_fp16.b" out_port="adder_fp16.out"/>
+          </pb_type>
+          <interconnect>
+            <direct name="a2a0" input="dsp_pb.datain[15:0]" output="adder_fp16[0].a"></direct>
+            <direct name="a2a1" input="dsp_pb.datain[31:16]" output="adder_fp16[1].a"></direct>
+            <direct name="b2b0" input="dsp_pb.datain[47:32]" output="adder_fp16[0].b"></direct>
+            <direct name="b2b1" input="dsp_pb.datain[63:48]" output="adder_fp16[1].b"></direct>
+            <direct name="out2out0" input="adder_fp16[0].out" output="dsp_pb.result[15:0]"></direct>
+            <direct name="out2out1" input="adder_fp16[1].out" output="dsp_pb.result[31:16]"></direct>
+          </interconnect>
+        </mode>
+
+        <!-- Clocked floating-point multiplier mode (result = a * b), 32-bit variant: one mult_fp_clk_32 instance clocked from dsp_pb.clk, with a = datain[31:0], b = datain[63:32] and out driving result[31:0]. -->
+        <mode name="mult_fp32_clocked_mode">
+          <pb_type name="mult_fp32" blif_model=".subckt mult_fp_clk_32" num_pb="1">
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <output name="out" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="mult_fp32.a" out_port="mult_fp32.out"/>
+            <delay_constant max="2.56e-9" in_port="mult_fp32.b" out_port="mult_fp32.out"/>
+
+            <T_setup value="18.91e-12" port="mult_fp32.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_fp32.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_fp32.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_fp32.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_fp32.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_fp32.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="clk" input="dsp_pb.clk" output="mult_fp32.clk"/>
+            <direct name="a2a" input="dsp_pb.datain[31:0]" output="mult_fp32.a">
+            </direct>
+            <direct name="b2b" input="dsp_pb.datain[63:32]" output="mult_fp32.b">
+            </direct>
+            <direct name="out2out" input="mult_fp32.out" output="dsp_pb.result[31:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Clocked floating-point multiplier mode (result = a * b), 16-bit variant: two mult_fp_clk_16 instances clocked from dsp_pb.clk; the a inputs come from datain[31:0], the b inputs from datain[63:32], and the outs drive result[31:0]. -->
+        <mode name="mult_fp16_clocked_mode">
+          <pb_type name="mult_fp16" blif_model=".subckt mult_fp_clk_16" num_pb="2">
+            <input name="a" num_pins="16"/>
+            <input name="b" num_pins="16"/>
+            <output name="out" num_pins="16"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="mult_fp16.a" out_port="mult_fp16.out"/>
+            <delay_constant max="2.56e-9" in_port="mult_fp16.b" out_port="mult_fp16.out"/>
+
+            <T_setup value="18.91e-12" port="mult_fp16.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_fp16.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mult_fp16.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_fp16.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_fp16.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mult_fp16.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="clk0" input="dsp_pb.clk" output="mult_fp16[0].clk"/>
+            <direct name="clk1" input="dsp_pb.clk" output="mult_fp16[1].clk"/>
+            <direct name="a2a0" input="dsp_pb.datain[15:0]" output="mult_fp16[0].a"></direct>
+            <direct name="a2a1" input="dsp_pb.datain[31:16]" output="mult_fp16[1].a"></direct>
+            <direct name="b2b0" input="dsp_pb.datain[47:32]" output="mult_fp16[0].b"></direct>
+            <direct name="b2b1" input="dsp_pb.datain[63:48]" output="mult_fp16[1].b"></direct>
+            <direct name="out2out0" input="mult_fp16[0].out" output="dsp_pb.result[15:0]"></direct>
+            <direct name="out2out1" input="mult_fp16[1].out" output="dsp_pb.result[31:16]"></direct>
+          </interconnect>
+        </mode>
+
+        <!-- Clocked floating-point adder mode (result = a + b), 32-bit variant: one addition_fp_clk_32 instance clocked from dsp_pb.clk, with a = datain[31:0], b = datain[63:32] and out driving result[31:0]. -->
+        <mode name="adder_fp32_clocked_mode"> 
+          <pb_type name="adder_fp32" blif_model=".subckt addition_fp_clk_32" num_pb="1">
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <output name="out" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="adder_fp32.a" out_port="adder_fp32.out"/>
+            <delay_constant max="2.56e-9" in_port="adder_fp32.b" out_port="adder_fp32.out"/>
+
+            <T_setup value="18.91e-12" port="adder_fp32.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="adder_fp32.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="adder_fp32.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="adder_fp32.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="adder_fp32.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="adder_fp32.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="clk" input="dsp_pb.clk" output="adder_fp32.clk"/>
+            <direct name="atoa" input="dsp_pb.datain[31:0]" output="adder_fp32.a">
+            </direct>
+            <direct name="btob" input="dsp_pb.datain[63:32]" output="adder_fp32.b">
+            </direct>
+            <direct name="sumouttosumout" input="adder_fp32.out" output="dsp_pb.result[31:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Clocked floating-point adder mode (result = a + b), 16-bit variant: two addition_fp_clk_16 instances clocked from dsp_pb.clk; the a inputs come from datain[31:0], the b inputs from datain[63:32], and the outs drive result[31:0]. -->
+        <mode name="adder_fp16_clocked_mode">
+          <pb_type name="adder_fp16" blif_model=".subckt addition_fp_clk_16" num_pb="2">
+            <input name="a" num_pins="16"/>
+            <input name="b" num_pins="16"/>
+            <output name="out" num_pins="16"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="adder_fp16.a" out_port="adder_fp16.out"/>
+            <delay_constant max="2.56e-9" in_port="adder_fp16.b" out_port="adder_fp16.out"/>
+
+            <T_setup value="18.91e-12" port="adder_fp16.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="adder_fp16.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="adder_fp16.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="adder_fp16.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="adder_fp16.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="adder_fp16.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="clk0" input="dsp_pb.clk" output="adder_fp16[0].clk"/>
+            <direct name="clk1" input="dsp_pb.clk" output="adder_fp16[1].clk"/>
+            <direct name="a2a0" input="dsp_pb.datain[15:0]" output="adder_fp16[0].a"></direct>
+            <direct name="a2a1" input="dsp_pb.datain[31:16]" output="adder_fp16[1].a"></direct>
+            <direct name="b2b0" input="dsp_pb.datain[47:32]" output="adder_fp16[0].b"></direct>
+            <direct name="b2b1" input="dsp_pb.datain[63:48]" output="adder_fp16[1].b"></direct>
+            <direct name="out2out0" input="adder_fp16[0].out" output="dsp_pb.result[15:0]"></direct>
+            <direct name="out2out1" input="adder_fp16[1].out" output="dsp_pb.result[31:16]"></direct>
+          </interconnect>
+        </mode>
+
+        <!-- Floating-point 32-bit MAC mode (result = a*b + accumulated value): one mac_fp_32 instance with a = datain[31:0], b = datain[63:32] and out driving result[31:0]. -->
+        <mode name="mac_fp32_mode">
+          <pb_type name="mac_fp32" blif_model=".subckt mac_fp_32" num_pb="1">
+            <input name="reset" num_pins="1"/>
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <output name="out" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="mac_fp32.a" out_port="mac_fp32.out"/>
+            <delay_constant max="2.56e-9" in_port="mac_fp32.b" out_port="mac_fp32.out"/>
+            <delay_constant max="2.56e-9" in_port="mac_fp32.reset" out_port="mac_fp32.out"/>
+
+            <T_setup value="18.91e-12" port="mac_fp32.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fp32.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fp32.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fp32.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp32.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp32.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp32.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp32.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset" input="dsp_pb.reset" output="mac_fp32.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="mac_fp32.clk"/>
+            <direct name="atoa" input="dsp_pb.datain[31:0]" output="mac_fp32.a">
+            </direct>
+            <direct name="btob" input="dsp_pb.datain[63:32]" output="mac_fp32.b">
+            </direct>
+            <direct name="sumouttosumout" input="mac_fp32.out" output="dsp_pb.result[31:0]">
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Floating-point 16-bit MAC mode (result = a*b + accumulated value): two mac_fp_16 instances; the a inputs come from datain[31:0], the b inputs from datain[63:32], and the outs drive result[31:0]. -->
+        <mode name="mac_fp16_mode">
+          <pb_type name="mac_fp16" blif_model=".subckt mac_fp_16" num_pb="2">
+            <input name="reset" num_pins="1"/>
+            <input name="a" num_pins="16"/>
+            <input name="b" num_pins="16"/>
+            <output name="out" num_pins="16"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="mac_fp16.a" out_port="mac_fp16.out"/>
+            <delay_constant max="2.56e-9" in_port="mac_fp16.b" out_port="mac_fp16.out"/>
+            <delay_constant max="2.56e-9" in_port="mac_fp16.reset" out_port="mac_fp16.out"/>
+
+            <T_setup value="18.91e-12" port="mac_fp16.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fp16.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fp16.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="mac_fp16.out" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp16.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp16.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp16.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="mac_fp16.out" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset0" input="dsp_pb.reset" output="mac_fp16[0].reset"></direct>
+            <direct name="reset1" input="dsp_pb.reset" output="mac_fp16[1].reset"></direct>
+            <direct name="clk0" input="dsp_pb.clk" output="mac_fp16[0].clk"></direct>
+            <direct name="clk1" input="dsp_pb.clk" output="mac_fp16[1].clk"></direct>
+            <direct name="atoa0" input="dsp_pb.datain[15:0]"  output="mac_fp16[0].a"></direct>
+            <direct name="atoa1" input="dsp_pb.datain[31:16]" output="mac_fp16[1].a"></direct>
+            <direct name="btob0" input="dsp_pb.datain[47:32]" output="mac_fp16[0].b"></direct>
+            <direct name="btob1" input="dsp_pb.datain[63:48]" output="mac_fp16[1].b"></direct>
+            <direct name="sumouttosumout0" input="mac_fp16[0].out" output="dsp_pb.result[15:0]"></direct>
+            <direct name="sumouttosumout1" input="mac_fp16[1].out" output="dsp_pb.result[31:16]"></direct>
+          </interconnect>
+        </mode>
+
+        <!-- floating point fp16 sum-of-2 mult mode (result = fp16_mult_top_a * fp16_mult_top_b + fp16_mult_bot_a * fp16_mult_bot_b). chainout = third_inp or result-->
+        <mode name="fp16_sum_of_products_mode"> 
+          <pb_type name="fp16_sum_of_2_mult" blif_model=".subckt fp16_mult_add" num_pb="1">
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="top_a" num_pins="16"/>
+            <input name="top_b" num_pins="16"/>
+            <input name="bot_a" num_pins="16"/>
+            <input name="bot_b" num_pins="16"/>
+            <input name="fp32_in" num_pins="32"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.reset" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.mode_sigs" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_a" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_b" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_a" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_b" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.fp32_in" out_port="fp16_sum_of_2_mult.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.reset" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.mode_sigs" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_a" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_b" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_a" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_b" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.fp32_in" out_port="fp16_sum_of_2_mult.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.top_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.top_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.bot_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.bot_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.fp32_in" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.top_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.top_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.bot_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.bot_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.fp32_in" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset" input="dsp_pb.reset" output="fp16_sum_of_2_mult.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="fp16_sum_of_2_mult.clk"/>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp16_sum_of_2_mult.mode_sigs">
+            </direct>
+            <direct name="atoa_top" input="dsp_pb.datain[15:0]" output="fp16_sum_of_2_mult.top_a">
+            </direct>
+            <direct name="btob_top" input="dsp_pb.datain[31:16]" output="fp16_sum_of_2_mult.top_b">
+            </direct>
+            <direct name="result_top" input="fp16_sum_of_2_mult.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="atoa_bot" input="dsp_pb.datain[47:32]" output="fp16_sum_of_2_mult.bot_a">
+            </direct>
+            <direct name="btob_bot" input="dsp_pb.datain[63:48]" output="fp16_sum_of_2_mult.bot_b">
+            </direct>
+            <direct name="result_bot" input="fp16_sum_of_2_mult.result" output="dsp_pb.result[63:32]">
+            </direct>
+            <direct name="fp32in" input="dsp_pb.datain[95:64]" output="fp16_sum_of_2_mult.fp32_in">
+            </direct>
+            <direct name="chainout" input="fp16_sum_of_2_mult.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>  
+
+        <!-- Floating-point fp16 sum-of-2-products mult mode with chain input: result = top_a * top_b + bot_a * bot_b + (fp32 chainin or third input); chainout = third input or result. Wiring below: top_a/top_b/bot_a/bot_b take datain[15:0]/[31:16]/[47:32]/[63:48], fp32_in takes datain[95:64], chainin takes dsp_pb.chainin[31:0]; the 32-bit result drives both dsp_pb.result[31:0] and dsp_pb.result[63:32]. NOTE(review): "third input" presumably means the fp32_in port; confirm. -->
+        <mode name="fp16_sum_of_products_2_mult_mode"> 
+          <pb_type name="fp16_sum_of_2_mult" blif_model=".subckt fp16_sop2_mult" num_pb="1">
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="top_a" num_pins="16"/>
+            <input name="top_b" num_pins="16"/>
+            <input name="bot_a" num_pins="16"/>
+            <input name="bot_b" num_pins="16"/>
+            <input name="fp32_in" num_pins="32"/>
+            <input name="chainin" num_pins="32"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.reset" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.mode_sigs" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_a" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_b" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_a" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_b" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.chainin" out_port="fp16_sum_of_2_mult.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.fp32_in" out_port="fp16_sum_of_2_mult.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.reset" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.mode_sigs" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_a" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.top_b" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_a" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.bot_b" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.chainin" out_port="fp16_sum_of_2_mult.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_mult.fp32_in" out_port="fp16_sum_of_2_mult.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.top_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.top_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.bot_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.bot_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.fp32_in" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_mult.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.top_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.top_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.bot_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.bot_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.fp32_in" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_mult.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset" input="dsp_pb.reset" output="fp16_sum_of_2_mult.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="fp16_sum_of_2_mult.clk"/>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp16_sum_of_2_mult.mode_sigs">
+            </direct>
+            <direct name="atoa_top" input="dsp_pb.datain[15:0]" output="fp16_sum_of_2_mult.top_a">
+            </direct>
+            <direct name="btob_top" input="dsp_pb.datain[31:16]" output="fp16_sum_of_2_mult.top_b">
+            </direct>
+            <direct name="result_top" input="fp16_sum_of_2_mult.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="atoa_bot" input="dsp_pb.datain[47:32]" output="fp16_sum_of_2_mult.bot_a">
+            </direct>
+            <direct name="btob_bot" input="dsp_pb.datain[63:48]" output="fp16_sum_of_2_mult.bot_b">
+            </direct>
+            <direct name="result_bot" input="fp16_sum_of_2_mult.result" output="dsp_pb.result[63:32]">
+            </direct>
+            <direct name="chainin" input="dsp_pb.chainin[31:0]" output="fp16_sum_of_2_mult.chainin">
+            </direct>
+            <direct name="fp32in" input="dsp_pb.datain[95:64]" output="fp16_sum_of_2_mult.fp32_in">
+            </direct>
+            <direct name="chainout" input="fp16_sum_of_2_mult.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>        
+
+        <!-- Floating-point fp16 sum-of-2-products accumulate mode: result = top_a * top_b + bot_a * bot_b + accumulator; chainout = result. This mode declares no fp32_in or chainin port. Wiring below: top_a/top_b/bot_a/bot_b take datain[15:0]/[31:16]/[47:32]/[63:48]; the 32-bit result drives both dsp_pb.result[31:0] and dsp_pb.result[63:32]. -->
+        <mode name="fp16_sum_of_products_2_accum_mode"> 
+          <pb_type name="fp16_sum_of_2_accum" blif_model=".subckt fp16_sop2_accum" num_pb="1">
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="reset" num_pins="1"/>
+            <input name="top_a" num_pins="16"/>
+            <input name="top_b" num_pins="16"/>
+            <input name="bot_a" num_pins="16"/>
+            <input name="bot_b" num_pins="16"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.mode_sigs" out_port="fp16_sum_of_2_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.top_a" out_port="fp16_sum_of_2_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.top_b" out_port="fp16_sum_of_2_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.bot_a" out_port="fp16_sum_of_2_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.bot_b" out_port="fp16_sum_of_2_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.reset" out_port="fp16_sum_of_2_accum.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.mode_sigs" out_port="fp16_sum_of_2_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.top_a" out_port="fp16_sum_of_2_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.top_b" out_port="fp16_sum_of_2_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.bot_a" out_port="fp16_sum_of_2_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.bot_b" out_port="fp16_sum_of_2_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_sum_of_2_accum.reset" out_port="fp16_sum_of_2_accum.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.top_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.top_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.bot_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.bot_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_sum_of_2_accum.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.top_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.top_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.bot_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.bot_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_sum_of_2_accum.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp16_sum_of_2_accum.mode_sigs"/>
+            <direct name="reset" input="dsp_pb.reset" output="fp16_sum_of_2_accum.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="fp16_sum_of_2_accum.clk"/>
+            <direct name="atoa_top" input="dsp_pb.datain[15:0]" output="fp16_sum_of_2_accum.top_a">
+            </direct>
+            <direct name="btob_top" input="dsp_pb.datain[31:16]" output="fp16_sum_of_2_accum.top_b">
+            </direct>
+            <direct name="result_top" input="fp16_sum_of_2_accum.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="atoa_bot" input="dsp_pb.datain[47:32]" output="fp16_sum_of_2_accum.bot_a">
+            </direct>
+            <direct name="btob_bot" input="dsp_pb.datain[63:48]" output="fp16_sum_of_2_accum.bot_b">
+            </direct>
+            <direct name="result_bot" input="fp16_sum_of_2_accum.result" output="dsp_pb.result[63:32]">
+            </direct>
+            <direct name="chainout" input="fp16_sum_of_2_accum.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>        
+
+        <!-- Floating-point fp16 mult, fp32 add mode: chainout = top_a * top_b + bot_a * bot_b; result = chainin + third input. Wiring below: top_a/top_b/bot_a/bot_b take datain[15:0]/[31:16]/[47:32]/[63:48], fp32_in takes datain[95:64], chainin takes dsp_pb.chainin[31:0]; the 32-bit result drives both dsp_pb.result[31:0] and dsp_pb.result[63:32]. NOTE(review): "third input" presumably means the fp32_in port; confirm. -->
+        <mode name="fp16_mult_fp32_add"> 
+          <pb_type name="fp16_mult_fp32_add" blif_model=".subckt fp16_mult_fp32_add" num_pb="1">
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="reset" num_pins="1"/>
+            <input name="top_a" num_pins="16"/>
+            <input name="top_b" num_pins="16"/>
+            <input name="bot_a" num_pins="16"/>
+            <input name="bot_b" num_pins="16"/>
+            <input name="fp32_in" num_pins="32"/>
+            <input name="chainin" num_pins="32"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.reset" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.mode_sigs" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.top_a" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.top_b" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.bot_a" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.bot_b" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.chainin" out_port="fp16_mult_fp32_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.fp32_in" out_port="fp16_mult_fp32_add.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.reset" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.mode_sigs" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.top_a" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.top_b" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.bot_a" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.bot_b" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.chainin" out_port="fp16_mult_fp32_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_add.fp32_in" out_port="fp16_mult_fp32_add.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.top_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.top_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.bot_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.bot_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.fp32_in" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_add.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.top_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.top_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.bot_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.bot_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.fp32_in" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_add.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="clk" input="dsp_pb.clk" output="fp16_mult_fp32_add.clk"/>
+            <direct name="reset" input="dsp_pb.reset" output="fp16_mult_fp32_add.reset"/>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp16_mult_fp32_add.mode_sigs">
+            </direct>
+            <direct name="atoa_top" input="dsp_pb.datain[15:0]" output="fp16_mult_fp32_add.top_a">
+            </direct>
+            <direct name="btob_top" input="dsp_pb.datain[31:16]" output="fp16_mult_fp32_add.top_b">
+            </direct>
+            <direct name="result_top" input="fp16_mult_fp32_add.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="atoa_bot" input="dsp_pb.datain[47:32]" output="fp16_mult_fp32_add.bot_a">
+            </direct>
+            <direct name="btob_bot" input="dsp_pb.datain[63:48]" output="fp16_mult_fp32_add.bot_b">
+            </direct>
+            <direct name="result_bot" input="fp16_mult_fp32_add.result" output="dsp_pb.result[63:32]">
+            </direct>
+            <direct name="chainin" input="dsp_pb.chainin[31:0]" output="fp16_mult_fp32_add.chainin">
+            </direct>
+            <direct name="fp32in" input="dsp_pb.datain[95:64]" output="fp16_mult_fp32_add.fp32_in">
+            </direct>
+            <direct name="chainout" input="fp16_mult_fp32_add.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>      
+
+        <!-- Floating-point fp16 mult, fp32 accumulate mode: chainout = top_a * top_b + bot_a * bot_b; result = third input + accumulator. This mode declares no chainin port. Wiring below: top_a/top_b/bot_a/bot_b take datain[15:0]/[31:16]/[47:32]/[63:48], fp32_in takes datain[95:64]; the 32-bit result drives both dsp_pb.result[31:0] and dsp_pb.result[63:32]. NOTE(review): "third input" presumably means the fp32_in port; confirm. -->
+        <mode name="fp16_mult_fp32_accum"> 
+          <pb_type name="fp16_mult_fp32_accum" blif_model=".subckt fp16_mult_fp32_accum" num_pb="1">
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="reset" num_pins="1"/>
+            <input name="top_a" num_pins="16"/>
+            <input name="top_b" num_pins="16"/>
+            <input name="bot_a" num_pins="16"/>
+            <input name="bot_b" num_pins="16"/>
+            <input name="fp32_in" num_pins="32"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.mode_sigs" out_port="fp16_mult_fp32_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.top_a" out_port="fp16_mult_fp32_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.top_b" out_port="fp16_mult_fp32_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.bot_a" out_port="fp16_mult_fp32_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.bot_b" out_port="fp16_mult_fp32_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.fp32_in" out_port="fp16_mult_fp32_accum.result"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.reset" out_port="fp16_mult_fp32_accum.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.mode_sigs" out_port="fp16_mult_fp32_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.top_a" out_port="fp16_mult_fp32_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.top_b" out_port="fp16_mult_fp32_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.bot_a" out_port="fp16_mult_fp32_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.bot_b" out_port="fp16_mult_fp32_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.fp32_in" out_port="fp16_mult_fp32_accum.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp16_mult_fp32_accum.reset" out_port="fp16_mult_fp32_accum.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.top_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.top_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.bot_a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.bot_b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.fp32_in" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp16_mult_fp32_accum.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.top_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.top_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.bot_a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.bot_b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.fp32_in" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp16_mult_fp32_accum.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp16_mult_fp32_accum.mode_sigs"/>
+            <direct name="reset" input="dsp_pb.reset" output="fp16_mult_fp32_accum.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="fp16_mult_fp32_accum.clk"/>
+            <direct name="atoa_top" input="dsp_pb.datain[15:0]" output="fp16_mult_fp32_accum.top_a">
+            </direct>
+            <direct name="btob_top" input="dsp_pb.datain[31:16]" output="fp16_mult_fp32_accum.top_b">
+            </direct>
+            <direct name="result_top" input="fp16_mult_fp32_accum.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="atoa_bot" input="dsp_pb.datain[47:32]" output="fp16_mult_fp32_accum.bot_a">
+            </direct>
+            <direct name="btob_bot" input="dsp_pb.datain[63:48]" output="fp16_mult_fp32_accum.bot_b">
+            </direct>
+            <direct name="result_bot" input="fp16_mult_fp32_accum.result" output="dsp_pb.result[63:32]">
+            </direct>
+            <direct name="fp32in" input="dsp_pb.datain[95:64]" output="fp16_mult_fp32_accum.fp32_in">
+            </direct>
+            <direct name="chainout" input="fp16_mult_fp32_accum.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>      
+
+        <!-- Floating-point fp32 mult_then_add mode: result = a * b + chainin; chainout = third input or result. Wiring below: a takes datain[31:0], b takes datain[63:32], fp32_in takes datain[95:64], chainin takes dsp_pb.chainin[31:0]; result drives dsp_pb.result[31:0] only. NOTE(review): "third input" presumably means the fp32_in port; confirm. -->
+        <mode name="fp32_mult_then_add"> 
+          <pb_type name="fp32_mult_then_add" blif_model=".subckt fp32_mult_then_add" num_pb="1">
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <input name="fp32_in" num_pins="32"/>
+            <input name="chainin" num_pins="32"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.reset" out_port="fp32_mult_then_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.mode_sigs" out_port="fp32_mult_then_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.a" out_port="fp32_mult_then_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.b" out_port="fp32_mult_then_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.chainin" out_port="fp32_mult_then_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.fp32_in" out_port="fp32_mult_then_add.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.reset" out_port="fp32_mult_then_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.mode_sigs" out_port="fp32_mult_then_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.a" out_port="fp32_mult_then_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.b" out_port="fp32_mult_then_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.chainin" out_port="fp32_mult_then_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_then_add.fp32_in" out_port="fp32_mult_then_add.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.fp32_in" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_then_add.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.fp32_in" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_then_add.result" clock="clk"/>
+          </pb_type>
+          <interconnect>
+            <direct name="reset" input="dsp_pb.reset" output="fp32_mult_then_add.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="fp32_mult_then_add.clk"/>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp32_mult_then_add.mode_sigs">
+            </direct>
+            <direct name="atoa" input="dsp_pb.datain[31:0]" output="fp32_mult_then_add.a">
+            </direct>
+            <direct name="btob" input="dsp_pb.datain[63:32]" output="fp32_mult_then_add.b">
+            </direct>
+            <direct name="result" input="fp32_mult_then_add.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="chainin" input="dsp_pb.chainin[31:0]" output="fp32_mult_then_add.chainin">
+            </direct>
+            <direct name="fp32in" input="dsp_pb.datain[95:64]" output="fp32_mult_then_add.fp32_in">
+            </direct>
+            <direct name="chainout" input="fp32_mult_then_add.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>      
+
+        <!-- floating point fp32 mult_add mode (chainout = fp32_mult_a * fp32_mult_b. chainout = third_inp + chainin)-->
+        <mode name="fp32_mult_add"> 
+          <pb_type name="fp32_mult_add" blif_model=".subckt fp32_mult_add" num_pb="1">
+            <input name="reset" num_pins="1"/>
+            <input name="mode_sigs" num_pins="12"/>
+            <input name="a" num_pins="32"/>
+            <input name="b" num_pins="32"/>
+            <input name="fp32_in" num_pins="32"/>
+            <input name="chainin" num_pins="32"/>
+            <output name="result" num_pins="32"/>
+            <output name="chainout" num_pins="32"/>
+            <clock name="clk" num_pins="1"/>
+
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.reset" out_port="fp32_mult_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.mode_sigs" out_port="fp32_mult_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.a" out_port="fp32_mult_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.b" out_port="fp32_mult_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.chainin" out_port="fp32_mult_add.result"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.fp32_in" out_port="fp32_mult_add.result"/>
+
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.reset" out_port="fp32_mult_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.mode_sigs" out_port="fp32_mult_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.a" out_port="fp32_mult_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.b" out_port="fp32_mult_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.chainin" out_port="fp32_mult_add.chainout"/>
+            <delay_constant max="2.56e-9" in_port="fp32_mult_add.fp32_in" out_port="fp32_mult_add.chainout"/>
+
+            <T_setup value="18.91e-12" port="fp32_mult_add.reset" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_add.mode_sigs" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_add.a" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_add.b" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_add.chainin" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_add.fp32_in" clock="clk"/>
+            <T_setup value="18.91e-12" port="fp32_mult_add.result" clock="clk"/>
+
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.reset" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.mode_sigs" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.a" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.b" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.chainin" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.fp32_in" clock="clk"/>
+            <T_clock_to_Q max="60.32e-12" min="60.32e-12" port="fp32_mult_add.result" clock="clk"/>
+          </pb_type>
+          <interconnect> <!-- Wires dsp_pb to the fp32_mult_add primitive: datain[31:0] to a, datain[63:32] to b, datain[95:64] to fp32_in; result, chainin and chainout use bits [31:0] -->
+            <direct name="reset" input="dsp_pb.reset" output="fp32_mult_add.reset"/>
+            <direct name="clk" input="dsp_pb.clk" output="fp32_mult_add.clk"/>
+            <direct name="mode_sigs" input="dsp_pb.mode_sigs" output="fp32_mult_add.mode_sigs">
+            </direct>
+            <direct name="atoa" input="dsp_pb.datain[31:0]" output="fp32_mult_add.a">
+            </direct>
+            <direct name="btob" input="dsp_pb.datain[63:32]" output="fp32_mult_add.b">
+            </direct>
+            <direct name="result" input="fp32_mult_add.result" output="dsp_pb.result[31:0]">
+            </direct>
+            <direct name="chainin" input="dsp_pb.chainin[31:0]" output="fp32_mult_add.chainin">
+            </direct>
+            <direct name="fp32in" input="dsp_pb.datain[95:64]" output="fp32_mult_add.fp32_in">
+            </direct>
+            <direct name="chainout" input="fp32_mult_add.chainout" output="dsp_pb.chainout[31:0]">
+            </direct>
+          </interconnect>
+        </mode>      
+      </pb_type>
+
+      <interconnect>
+        <direct name="mode_sigs" input="dsp.dsp_I1[11:0]" output ="dsp_pb.mode_sigs"/>
+        <direct name="datain1" input="dsp.dsp_I1[63:12]" output ="dsp_pb.datain[51:0]"/>
+        <direct name="datain2" input="dsp.dsp_I2" output ="dsp_pb.datain[115:52]"/>
+        <direct name="reset" input="dsp.reset" output="dsp_pb.reset"></direct>
+        <direct name="chainin" input="dsp.chainin"    output="dsp_pb.chainin"></direct>
+        <direct name="chainout" input="dsp_pb.chainout" output="dsp.chainout"></direct>
+        <direct name="scanin" input="dsp.scanin"    output="dsp_pb.scanin"></direct>
+        <direct name="scanout" input="dsp_pb.scanout" output="dsp.scanout"></direct>
+        <direct name="result" input="dsp_pb.result" output="dsp.result"></direct>
+        <direct name="clk" input="dsp.clk" output="dsp_pb.clk"></direct>
+      </interconnect>  
+
+    </pb_type>
+    <interconnect>
+        <!--50% sparse crossbar means 50% of the lines can reach an actual input of the dsp 
+        We do this by splitting inputs into two buckets and having two full crossbars-->
+        <!--
+       <complete name="first_half" input="dsp_top.dsp_I1" output="dsp.dsp_I1">
+            <delay_constant max="333e-12" in_port="dsp_top.dsp_I1" out_port="dsp.dsp_I1"/>
+       </complete>
+
+        <complete name="second_half" input="dsp_top.dsp_I2" output="dsp.dsp_I2">
+            <delay_constant max="333e-12" in_port="dsp_top.dsp_I2" out_port="dsp.dsp_I2"/>
+        </complete>
+        -->
+        <direct name="enable" input="dsp_top.dsp_I1[0]" output ="dsp.dsp_I1[0]"/>
+        <direct name="loadconst" input="dsp_top.dsp_I1[1]" output ="dsp.dsp_I1[1]"/>
+        <direct name="accumulate" input="dsp_top.dsp_I1[2]" output ="dsp.dsp_I1[2]"/>
+        <direct name="negate" input="dsp_top.dsp_I1[3]" output ="dsp.dsp_I1[3]"/>
+        <direct name="sub" input="dsp_top.dsp_I1[4]" output ="dsp.dsp_I1[4]"/>
+        <direct name="mode" input="dsp_top.dsp_I1[7:5]" output ="dsp.dsp_I1[7:5]"/>
+        <direct name="mux9_select" input="dsp_top.dsp_I1[8]" output ="dsp.dsp_I1[8]"/>
+        <direct name="internal_coeffa" input="dsp_top.dsp_I1[9]" output ="dsp.dsp_I1[9]"/>
+        <direct name="internal_coeffb" input="dsp_top.dsp_I1[10]" output ="dsp.dsp_I1[10]"/>
+        <direct name="datain1" input="dsp_top.dsp_I1[63:11]" output ="dsp.dsp_I1[63:11]"/>
+        <direct name="datain2" input="dsp_top.dsp_I2" output ="dsp.dsp_I2"/>
+ 
+        <direct name="reset" input="dsp_top.reset" output="dsp.reset"></direct>
+        <direct name="chainin" input="dsp_top.chainin" output="dsp.chainin">
+            <delay_constant max="1179e-12" in_port="dsp_top.chainin" out_port="dsp.chainin"/>
+        </direct>
+        <direct name="chainout" input="dsp.chainout" output="dsp_top.chainout">
+            <delay_constant max="1179e-12" in_port="dsp.chainout" out_port="dsp_top.chainout"/>
+        </direct>
+        <direct name="scanin" input="dsp_top.scanin" output="dsp.scanin">
+            <delay_constant max="1179e-12" in_port="dsp_top.scanin" out_port="dsp.scanin"/>
+        </direct>
+        <direct name="scanout" input="dsp.scanout" output="dsp_top.scanout">
+            <delay_constant max="1179e-12" in_port="dsp.scanout" out_port="dsp_top.scanout"/>
+        </direct>
+        <direct name="result" input="dsp.result" output="dsp_top.result"></direct>
+        <direct name="clk" input="dsp_top.clk" output="dsp.clk"></direct>
+    </interconnect>
+    </pb_type>
+    <!-- Define DSP slice end -->
+
+
+    <!-- Define fracturable memory begin -->
+    <!-- 
+    RAM blocks always have registered inputs. The input FFs appear before the address decoder & wordline driver,
+    and after the local input crossbar & level shifter.
+    RAM blocks optionally have registered outputs. The output FFs (if present) appear after the output crossbar.
+    If BRAM doesn't have registered outputs, then T_clk_to_q is the whole delay of the read/write operation.
+    If BRAM does have registered output, then T_clk_to_q is just the FF clk_to_q and then delay_constant
+    can be used to specify the whole delay of the read/write operation.
+
+    This RAM block has registered outputs.
+
+    The area and delay values of this RAM block were obtained (indirectly) from COFFE simulations.
+    COFFE only support widths and depths that are powers of 2. For M20K (20 Kilobit BRAM), we need
+    the width to be 40 bits and depth to be 512 (for the logically widest mode: 512x40). We can't
+    simulate these dimensions directly in COFFE. So, we simulated and obtained the results for M32K
+    (32 Kilobits BRAM) and (16 Kilobits BRAM). Then we interpolated the results.
+    For delay, a linear interpolation was used, based on the size of the Memory (16K->20K->32K).
+    For area, the value was calculated using two interpolations: (1) port based (change in number of 
+    ports in going from 16K->20K->32K) and (2) number of bits based (change in number of bits in
+    going from 16K->20K->32K). The interpolation that resulted in the larger area was picked.
+    
+
+    Here are the equations used to calculate the delays based on COFFE results:
+    T_setup (inputs) = T_level_shifter + T_register_micro_setup = 32.3ps + 18.91ps = 51.21ps (NOTE: the input T_setup entries below use 51.12e-12; confirm which value is intended)
+    T_clk_to_q (inputs) = T_register_micro_clk_to_q = 60.32ps
+    T_setup (outputs) = T_register_micro_setup = 18.91ps 
+    T_clk_to_q (outputs) = T_register_micro_clk_to_q = 60.32ps
+
+    (Register setup and clk_to_q timings are actually from the FF used in the logic cluster.)
+
+    T_read = T1 + T2 + T3
+    = max (Row decoder, Pre-charge time) + (Wordline driver + Bit line delay) + (Sense amp + Output crossbar)
+
+    * Bit line delay is included in self.RAM.samp.delay time in COFFE. The Sense amp delay is actually
+    self.RAM.samp_part2.delay
+
+    T_write = T1 + T2 + T3
+    = max (Row decoder, Pre-charge time) + (Wordline driver) + (Write driver)
+
+    delay_constant values model the internal limits of a block (the combinatorial delay).
+    delay_constant = max (T_read, T_write) 
+
+    Overall internal delay of the RAM is T_clk_to_q (inputs) + delay_constant + T_setup (outputs)
+    -->
+    <pb_type name="memory">
+      <input name="addr1" num_pins="11"/>
+      <input name="addr2" num_pins="11"/>
+      <input name="data" num_pins="40"/>
+      <input name="we1" num_pins="1"/>
+      <input name="we2" num_pins="1"/>
+      <output name="out" num_pins="40"/>
+      <clock name="clk" num_pins="1"/>
+      <!-- Specify single port mode first -->
+      <mode name="mem_512x40_sp">
+        <pb_type name="mem_512x40_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="9" port_class="address"/>
+          <input name="data" num_pins="40" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="40" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+
+          <T_setup value="51.12e-12" port="mem_512x40_sp.addr" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_512x40_sp.data" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_512x40_sp.we" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_512x40_sp.out" clock="clk"/>
+
+          <T_clock_to_Q max="60.32e-12" port="mem_512x40_sp.addr" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_512x40_sp.data" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_512x40_sp.we" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_512x40_sp.out" clock="clk"/>
+
+          <delay_constant max="0.852e-9" in_port="mem_512x40_sp.addr" out_port="mem_512x40_sp.out"/>
+          <delay_constant max="0.852e-9" in_port="mem_512x40_sp.data" out_port="mem_512x40_sp.out"/>
+          <delay_constant max="0.852e-9" in_port="mem_512x40_sp.we"   out_port="mem_512x40_sp.out"/>
+
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[8:0]" output="mem_512x40_sp.addr">
+          </direct>
+          <direct name="data1" input="memory.data" output="mem_512x40_sp.data">
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_512x40_sp.we">
+          </direct>
+          <direct name="dataout1" input="mem_512x40_sp.out" output="memory.out">
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_512x40_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+
+      <mode name="mem_1024x20_sp">
+        <pb_type name="mem_1024x20_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="10" port_class="address"/>
+          <input name="data" num_pins="20" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="20" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+
+          <T_setup value="51.12e-12" port="mem_1024x20_sp.addr" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_sp.data" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_sp.we" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_1024x20_sp.out" clock="clk"/>
+
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_sp.addr" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_sp.data" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_sp.we" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_sp.out" clock="clk"/>
+
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_sp.addr" out_port="mem_1024x20_sp.out"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_sp.data" out_port="mem_1024x20_sp.out"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_sp.we"   out_port="mem_1024x20_sp.out"/>
+
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_sp.addr">
+          </direct>
+          <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_sp.data">
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_1024x20_sp.we">
+          </direct>
+          <direct name="dataout1" input="mem_1024x20_sp.out" output="memory.out[19:0]">
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_1024x20_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+
+      <mode name="mem_2048x10_sp">
+        <pb_type name="mem_2048x10_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="11" port_class="address"/>
+          <input name="data" num_pins="10" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="10" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+
+          <T_setup value="51.12e-12" port="mem_2048x10_sp.addr" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_sp.data" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_sp.we" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_2048x10_sp.out" clock="clk"/>
+
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_sp.addr" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_sp.data" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_sp.we" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_sp.out" clock="clk"/>
+
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_sp.addr" out_port="mem_2048x10_sp.out"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_sp.data" out_port="mem_2048x10_sp.out"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_sp.we"   out_port="mem_2048x10_sp.out"/>
+
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_sp.addr">
+          </direct>
+          <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_sp.data">
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_2048x10_sp.we">
+          </direct>
+          <direct name="dataout1" input="mem_2048x10_sp.out" output="memory.out[9:0]">
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_2048x10_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+
+      <!-- Specify true dual port mode next -->
+      <mode name="mem_1024x20_dp">
+        <pb_type name="mem_1024x20_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="10" port_class="address1"/>
+          <input name="addr2" num_pins="10" port_class="address2"/>
+          <input name="data1" num_pins="20" port_class="data_in1"/>
+          <input name="data2" num_pins="20" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="20" port_class="data_out1"/>
+          <output name="out2" num_pins="20" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+
+          <T_setup value="51.12e-12" port="mem_1024x20_dp.addr1" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_dp.data1" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_dp.we1" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_dp.addr2" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_dp.data2" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_1024x20_dp.we2" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_1024x20_dp.out1" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_1024x20_dp.out2" clock="clk"/>
+
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.addr1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.data1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.we1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.addr2" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.data2" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_1024x20_dp.out2" clock="clk"/>
+
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_dp.addr1" out_port="mem_1024x20_dp.out1"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_dp.data1" out_port="mem_1024x20_dp.out1"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_dp.we1" out_port="mem_1024x20_dp.out1"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_dp.addr2" out_port="mem_1024x20_dp.out2"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_dp.data2" out_port="mem_1024x20_dp.out2"/>
+          <delay_constant max="0.852e-9" in_port="mem_1024x20_dp.we2" out_port="mem_1024x20_dp.out2"/>
+
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_dp.addr1">
+          </direct>
+          <direct name="address2" input="memory.addr2[9:0]" output="mem_1024x20_dp.addr2">
+          </direct>
+          <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_dp.data1">
+          </direct>
+          <direct name="data2" input="memory.data[39:20]" output="mem_1024x20_dp.data2">
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_1024x20_dp.we1">
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_1024x20_dp.we2">
+          </direct>
+          <direct name="dataout1" input="mem_1024x20_dp.out1" output="memory.out[19:0]">
+          </direct>
+          <direct name="dataout2" input="mem_1024x20_dp.out2" output="memory.out[39:20]">
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_1024x20_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+
+      <mode name="mem_2048x10_dp">
+        <pb_type name="mem_2048x10_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="11" port_class="address1"/>
+          <input name="addr2" num_pins="11" port_class="address2"/>
+          <input name="data1" num_pins="10" port_class="data_in1"/>
+          <input name="data2" num_pins="10" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="10" port_class="data_out1"/>
+          <output name="out2" num_pins="10" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+
+          <T_setup value="51.12e-12" port="mem_2048x10_dp.addr1" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_dp.data1" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_dp.we1" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_dp.addr2" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_dp.data2" clock="clk"/>
+          <T_setup value="51.12e-12" port="mem_2048x10_dp.we2" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_2048x10_dp.out1" clock="clk"/>
+          <T_setup value="18.91e-12" port="mem_2048x10_dp.out2" clock="clk"/>
+
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.addr1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.data1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.we1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.addr2" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.data2" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="60.32e-12" port="mem_2048x10_dp.out2" clock="clk"/>
+
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_dp.addr1" out_port="mem_2048x10_dp.out1"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_dp.data1" out_port="mem_2048x10_dp.out1"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_dp.we1" out_port="mem_2048x10_dp.out1"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_dp.addr2" out_port="mem_2048x10_dp.out2"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_dp.data2" out_port="mem_2048x10_dp.out2"/>
+          <delay_constant max="0.852e-9" in_port="mem_2048x10_dp.we2" out_port="mem_2048x10_dp.out2"/>
+
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_dp.addr1">
+          </direct>
+          <direct name="address2" input="memory.addr2[10:0]" output="mem_2048x10_dp.addr2">
+          </direct>
+          <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_dp.data1">
+          </direct>
+          <direct name="data2" input="memory.data[19:10]" output="mem_2048x10_dp.data2">
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_2048x10_dp.we1">
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_2048x10_dp.we2">
+          </direct>
+          <direct name="dataout1" input="mem_2048x10_dp.out1" output="memory.out[9:0]">
+          </direct>
+          <direct name="dataout2" input="mem_2048x10_dp.out2" output="memory.out[19:10]">
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_2048x10_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+    </pb_type>
+    <!-- Define fracturable memory end -->
+  </complexblocklist>
+
+  <switchblocklist>
+    <!-- Stratix IV uses a uni-directional routing architecture with a Driver Input Mux (DIM) size of 12 (i.e.
+           each wire can be driven by one of 12 block/outputs or wires) for the L4s.
+           
+           In the Stratix IV architecture the long wires (L16 here) are accessible only from the short wires, 
+           and are not connected to the block pins (i.e. connection blocks). Furthermore, they only connect 
+           to switch blocks every 4 LABs (to avoid expensive deep via stacks).
+           We approximate the L16 DIM size as 40:1 (in reality it is a pair of 20:1 (?) muxes with a 2:1 swap mux
+           in front, which has nearly the same connectivity as a full 40:1).
+
+           L4 wires
+           ================
+           At a channel width of 300 there are 260 L4/L4prime wires. At an effective Fc_out of 0.075 
+           and 40 LAB outputs this yields:
+
+                40 * 2 = 80 outputs per channel  [2 LABs per-channel]
+
+                80 * 0.075 = 6 outputs drive each L4 wire [output connection block]
+
+           This leaves:
+
+                12 - 6 = 6 inputs to the DIMs from other routing wires [switch block]
+
+           Since L4s connect at every switch block, there are:
+
+                260 L4 wires per channel + direction which can drive wires at a particular switchblock
+                (via switchpoints 0, 1, 2, 3)
+
+           And for each direction (260 wires) only:
+
+               260 / 4 = 65 wires starting/ending per channel + direction at each switch block
+               (i.e. from each direction, north/south/east/west, there are 32 L4s starting, and 32 L4s ending; + 1 wire for the 65th)
+
+           Which we allocate as follows:
+
+                L4
+                =====
+                straight-through connection: 2 (from L4 or L16)
+                clock-wise turn            : 2 (from L4 or L16)
+                counter-clock-wise turn    : 2 (from L4 or L16)
+
+           L16 wires
+           =========
+           At a channel width of 300 there are 40 L16 wires (20 in each direction), which do not connect to the input/output connection blocks.
+           This leaves 40 inputs to the DIM to select from routing wires (long wires use larger DIMs to improve reachability,
+           the area cost is relatively small since they are so rare).
+
+           Since L16s only connect at every 4th switch block there are:
+
+                40 / 4 = 10 L16 wires per channel (5 in each direction) which can drive wires at a particular switchblock
+                (via switchpoints 0, 4, 8, 12)
+
+           And for each direction (20 wires) only:
+
+               40 / 16 = 2.5 => 2 wires starting/ending per channel + direction at each switch block
+               (i.e. from each direction, north/south/east/west, there is one L16 starting, and one L16 ending)
+           
+           We assign the 40 DIM inputs as follows:
+
+                L16
+                =====
+                straight-through connection:  3 (from L16)
+                straight-through connection: 11 (from L4)
+                clock-wise turn            :  3 (from L16)
+                clock-wise turn            : 10 (from L4)
+                counter clock-wise turn    :  3 (from L16)
+                counter clock-wise turn    : 10 (from L4)
+
+           Switch pattern
+           ==============
+           This switch block is based on the Wilton switch block (see Page 103 of Steve Wilton's PhD Thesis 
+           "Architecture and Algorithms for Field-Programmable Gate Arrays with Embedded Memory", 1997):
+
+                left-to-top: W - t
+                top-to-right: t + 1
+                right-to-bottom: 2*W - 2 - t
+                bottom-to-left: t + 1
+                left-to-right: t
+                top-to-bottom: t
+
+           Since Wilton assumed bidirectional routing (while we use unidirectional routing),
+           we mirror the clock-wise turns to match the counter-clock-wise specification.
+           -->
+    <switchblock name="wilton_turn_clockwise_core" type="unidir">
+      <switchblock_location type="CORE"/>
+      <switchfuncs>
+        <!-- Clock-wise turns -->
+        <func type="tl" formula="W-t"/>
+        <!-- top to left -->
+        <func type="rt" formula="t+1"/>
+        <!-- right to top -->
+        <func type="br" formula="2*W-2-t"/>
+        <!-- bottom to right -->
+        <func type="lb" formula="t+1"/>
+        <!-- left to bottom -->
+      </switchfuncs>
+      <!-- L16 drivers -->
+      <wireconn num_conns="3*to" from_type="L16" from_switchpoint="0,12,8,4" to_type="L16" to_switchpoint="0"/>
+      <wireconn num_conns="10*to" from_type="L4" from_switchpoint="0" to_type="L16" to_switchpoint="0"/>
+      <!-- L4 drivers 
+
+               Driving from L16 (few) to L4 (many) we prefer driving from the end-point of the L16, although since there are many they will
+               all be multiply connected.
+               
+               Driving from L4 (many) to L4 (many) shuffle the switchpoints so the L4's are driven from a variety of switchpoints.
+               Since the actual number L4s starting/ending are equal, using 'fixed' from_order would mean only switchpoint 0 -> 0
+               connections. A 'shuffled' order will mix-up the from switchpoints for more diversity.
+               -->
+      <wireconn num_conns="2*to" from_order="shuffled">
+        <from type="L16" switchpoint="0,12,8,4"/>
+        <from type="L4" switchpoint="0,1,2,3"/>
+        <to type="L4" switchpoint="0"/>
+      </wireconn>
+    </switchblock>
+    <switchblock name="wilton_turn_counter_clockwise_core" type="unidir">
+      <switchblock_location type="CORE"/>
+      <switchfuncs>
+        <!-- Counter-clock-wise turns -->
+        <func type="lt" formula="W-t"/>
+        <!-- left to top -->
+        <func type="tr" formula="t+1"/>
+        <!-- top to right -->
+        <func type="rb" formula="2*W-2-t"/>
+        <!-- right to bottom -->
+        <func type="bl" formula="t+1"/>
+        <!-- bottom to left -->
+      </switchfuncs>
+      <!-- L16 drivers -->
+      <wireconn num_conns="3*to" from_type="L16" from_switchpoint="0,12,8,4" to_type="L16" to_switchpoint="0"/>
+      <wireconn num_conns="10*to" from_type="L4" from_switchpoint="0" to_type="L16" to_switchpoint="0"/>
+      <!-- L4 drivers 
+
+               Driving from L16 (few) to L4 (many) we prefer driving from the end-point of the L16, although since there are many they will
+               all be multiply connected.
+               
+               Driving from L4 (many) to L4 (many) shuffle the switchpoints so the L4's are driven from a variety of switchpoints.
+               Since the actual number L4s starting/ending are equal, using 'fixed' from_order would mean only switchpoint 0 -> 0
+               connections. A 'shuffled' order will mix-up the from switchpoints for more diversity.
+
+               NOTE(review): a different from_switchpoints ordering was intended here so that the shuffling differs from
+               wilton_turn_clockwise_core, but the orderings listed below are identical to that block's; confirm.
+               -->
+      <wireconn num_conns="2*to" from_order="shuffled">
+        <from type="L16" switchpoint="0,12,8,4"/>
+        <from type="L4" switchpoint="0,1,2,3"/>
+        <to type="L4" switchpoint="0"/>
+      </wireconn>
+    </switchblock>
+    <switchblock name="wilton_straight" type="unidir">
+      <switchblock_location type="EVERYWHERE"/>
+      <switchfuncs>
+        <!-- Straight -->
+        <func type="lr" formula="t"/>
+        <!-- left to right -->
+        <func type="tb" formula="t"/>
+        <!-- top to bottom -->
+        <func type="rl" formula="t"/>
+        <!-- right to left -->
+        <func type="bt" formula="t"/>
+        <!-- bottom to top -->
+      </switchfuncs>
+      <!-- L16 Drivers 
+                Note that we order the switchpoints in order of preference, since VPR currently
+                iterates through the source sets in order, such that we connect first to wires
+                ending at the switchblock (switchpoint 0), and then fallback to switchpoints
+                in decreasing distance from the drive point (if we have more to's than from's
+                it then wraps around).
+
+                Note also that we multiply the number of expected connections by 'to', since while usually
+                there is only one 'to' wire, occasionally there may be more, and we want to ensure they all
+                get the same number of connections.
+
+                For L16->L16:
+                  We allow any valid switchpoint to be used as the 'from' point.
+                  Allowing 'low' switchpoints like '4' may seem counter-intuitive (i.e. why not use a cheaper L4),
+                  but this makes it easier to bypass once on the L16 network (e.g. to get around congestion).
+           -->
+      <wireconn num_conns="3*to" from_type="L16" from_switchpoint="0,12,8,4" to_type="L16" to_switchpoint="0"/>
+      <wireconn num_conns="11*to" from_type="L4" from_switchpoint="0,3,2,1" to_type="L16" to_switchpoint="0"/>
+      <!-- L4 Drivers -->
+      <wireconn num_conns="2*to" from_order="shuffled">
+        <from type="L16" switchpoint="0,12,8,4"/>
+        <from type="L4" switchpoint="0"/>
+        <to type="L4" switchpoint="0"/>
+      </wireconn>
+      <!--<wireconn num_conns="1*to" from_type="L4" from_switchpoint="0" to_type="L4" to_switchpoint="0"/>-->
+      <!--<wireconn num_conns="1*to" from_type="L16" from_switchpoint="0,12,8,4" to_type="L4" to_switchpoint="0"/>-->
+    </switchblock>
+    <switchblock name="wilton_straight_corner" type="unidir">
+      <!-- Same as wilton straight, but turning around a corner -->
+      <switchblock_location type="CORNER"/>
+      <switchfuncs>
+        <!-- Counter-clock-wise turns -->
+        <func type="lt" formula="t"/>
+        <!-- left to top -->
+        <func type="tr" formula="t"/>
+        <!-- top to right -->
+        <func type="rb" formula="t"/>
+        <!-- right to bottom -->
+        <func type="bl" formula="t"/>
+        <!-- bottom to left -->
+        <!-- Clock-wise turns -->
+        <func type="tl" formula="t"/>
+        <!-- top to left -->
+        <func type="rt" formula="t"/>
+        <!-- right to top -->
+        <func type="br" formula="t"/>
+        <!-- bottom to right -->
+        <func type="lb" formula="t"/>
+        <!-- left to bottom -->
+      </switchfuncs>
+      <!-- L16 Drivers -->
+      <wireconn num_conns="3*to" from_type="L16" from_switchpoint="0,12,8,4" to_type="L16" to_switchpoint="0"/>
+      <wireconn num_conns="11*to" from_type="L4" from_switchpoint="0,3,2,1" to_type="L16" to_switchpoint="0"/>
+      <!-- L4 Drivers -->
+      <wireconn num_conns="2*to" from_order="shuffled">
+        <from type="L16" switchpoint="0,12,8,4"/>
+        <from type="L4" switchpoint="0"/>
+        <to type="L4" switchpoint="0"/>
+      </wireconn>
+      <!--<wireconn num_conns="1*to" from_type="L4" from_switchpoint="0" to_type="L4" to_switchpoint="0"/>-->
+      <!--<wireconn num_conns="1*to" from_type="L16" from_switchpoint="0,12,8,4" to_type="L4" to_switchpoint="0"/>-->
+    </switchblock>
+    <switchblock name="wilton_turn_fringe" type="unidir">
+      <!-- Non-corner perimeter SBs -->
+      <switchblock_location type="FRINGE"/>
+      <switchfuncs>
+        <!-- Counter-clock-wise turns -->
+        <func type="lt" formula="W-t"/>
+        <!-- left to top -->
+        <func type="tr" formula="t+1"/>
+        <!-- top to right -->
+        <func type="rb" formula="2*W-2-t"/>
+        <!-- right to bottom -->
+        <func type="bl" formula="t+1"/>
+        <!-- bottom to left -->
+        <!-- Clock-wise turns -->
+        <func type="tl" formula="W-t"/>
+        <!-- top to left -->
+        <func type="rt" formula="t+1"/>
+        <!-- right to top -->
+        <func type="br" formula="2*W-2-t"/>
+        <!-- bottom to right -->
+        <func type="lb" formula="t+1"/>
+        <!-- left to bottom -->
+      </switchfuncs>
+      <!-- We use 'max' style connections here to ensure there are no dangling wires, otherwise like core turns -->
+      <!-- L16 drivers -->
+      <wireconn num_conns="3*max(from,to)" from_type="L16" from_switchpoint="0,12,8,4" to_type="L16" to_switchpoint="0"/>
+      <wireconn num_conns="21*max(from,to)" from_type="L4" from_switchpoint="0" to_type="L16" to_switchpoint="0"/>
+      <!-- L4 drivers -->
+      <wireconn num_conns="1*max(from,to)" from_type="L16" from_switchpoint="0,12,8,4" from_order="fixed" to_type="L4" to_switchpoint="0"/>
+      <wireconn num_conns="1*max(from,to)" from_type="L4" from_switchpoint="0,1,2,3" from_order="shuffled" to_type="L4" to_switchpoint="0"/>
+    </switchblock>
+  </switchblocklist>
+
+  <clocks>
+    <clock buffer_size="auto" C_wire="2.5e-10"/>
+  </clocks>
+</architecture>
+
+
diff --git a/third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml b/third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml
new file mode 100644
index 000000000..b8d26348a
--- /dev/null
+++ b/third_party/vtr_flow/arch/k6_frac_N10_frac_chain_mem32K_40nm.xml
@@ -0,0 +1,1505 @@
+<!-- 
+  Flagship Heterogeneous Architecture with Carry Chains for VTR 7.0.
+
+  - 40 nm technology
+  - General purpose logic block: 
+    K = 6, N = 10, fracturable 6 LUTs (can operate as one 6-LUT or two 5-LUTs with all 5 inputs shared) 
+    with optionally registered outputs
+    Each 5-LUT has an arithmetic mode that converts it to a single-bit adder with both inputs driven by 4-LUTs (both 4-LUTs share all 4 inputs)
+    Carry chain links to vertically adjacent logic blocks
+  - Memory size 32 Kbits, memory aspect ratios vary from a data width of 1 to data width of 64.  
+    Height = 6, found on every (8n+2)th column
+  - Multiplier modes: one 36x36, two 18x18, each 18x18 can also operate as two 9x9.  
+    Height = 4, found on every (8n+6)th column
+  - Routing architecture: L = 4, fc_in = 0.15, Fc_out = 0.1
+
+  Details on Modelling:
+
+  The electrical design of the architecture described here is NOT from an 
+  optimized, SPICED architecture.  Instead, we attempt to create a reasonable 
+  architecture file by using an existing commercial FPGA to approximate the area, 
+  delay, and power of the underlying components. This is combined with a reasonable 40 nm 
+  model of wiring and circuit design for low-level routing components, where available.
+  The resulting architecture has delays that roughly match a commercial 40 nm FPGA, but also 
+  has wiring electrical parameters that allow the wire lengths and switch patterns to be 
+  modified and you will still get reasonable delay results for the new architecture.
+  The following describes, in detail, how we obtained the various electrical values for this 
+  architecture.
+
+  Rmin for nmos and pmos, routing buffer sizes, and I/O pad delays are from the ifar 
+  architecture created by Ian Kuon: K06 N10 45nm fc 0.15 area-delay optimized architecture. 
+  (n10k06l04.fc15.area1delay1.cmos45nm.bptm.cmos45nm.xml)      
+  This routing architecture was optimized for 45 nm, and we have scaled it linearly to 40 nm to 
+  match the overall target (a 40 nm FPGA).
+
+  We obtain maximum delay numbers by measuring delays of routing, soft logic blocks, 
+  memories, and multipliers from test circuits on a Stratix IV GX device 
+  (EP4SGX230DF29C2X, i.e. fastest speed grade). Minimum delay values are calculated based on the
+  ratios between maximum and minimum values in Stratix IV GX device. For routing, we took the 
+  average delay of H4 and V4 wires.  Rmetal and Cmetal values for the routing wires were obtained 
+  from work done by Charles Chiasson. We use a 96 nm half-pitch (corresponding to mid-level metal 
+  stack 40 nm routing) and take the R and C data from the ITRS roadmap. 
+
+  For the general purpose logic block, we assume that the area and delays of the Stratix IV 
+  crossbar is close enough to the crossbar modelled here.  We use 40 inputs and 20 feedback lines in 
+  the cluster and a full crossbar, leading to 53:1 multiplexers in front of each BLE input.
+  Stratix IV uses 52 inputs and 20 feedback lines, but only a half-populated crossbar, leading to 
+  36:1 multiplexers.  We require 60 such multiplexers, while Stratix IV requires 88 for its more
+  complex fracturable BLEs + the extra control signals. We justify this rough approximation as follows: 
+  The Stratix IV crossbar has more inputs (72 vs. 60) and 
+  outputs (88 vs. 60) than our full crossbar which should increase its area and delay, but the 
+  Stratix IV crossbar is also 50% sparse (each mux is 36:1 instead of 53:1) which should reduce its 
+  area and delay.  The total number of crossbar switch points is roughly similar between the two 
+  architectures (3160 for SIV and 3600 for the academic architecture below), so we use the area 
+  & delay of the Stratix IV crossbar as a rough approximation of our crossbar.
+
+  For LUTs, we include LUT 
+  delays measured from Stratix IV, which are dependent on the input used (i.e. some 
+  LUT inputs are faster than others).  The CAD tools at the time of VTR 7 do 
+  not consider differences in LUT input delays.
+
+  Adder delays obtained as approximate values from a Stratix IV EP4SE230F29C3 device.  
+  Delay obtained by compiling a 256 bit adder (registered inputs and outputs, 
+  all pins except clock virtual) then measuring the delays in chip-planner, 
+  sumout delay = 0.271ns to 0.348 ns, intra-block carry delay = 0.011 ns, 
+  inter-block carry delay = 0.327 ns.  Given this data, I will approximate 
+  sumout 0.3 ns, intra-block carry-delay = 0.01 ns, and 
+  inter-block carry-delay = 0.16 ns (since Altera inter-block carry delay has 
+  overhead that we don't have, I'll approximate the delay of a simpler chain at 
+  one half what they have.  This is very rough, anything from 0.01ns to 0.327ns 
+  can be justified).
+
+  Logic block area numbers obtained by scaling overall tile area of a 65nm 
+  Stratix III device, (as given in Wong, Betz and Rose, FPGA 2011) to 40 nm, then subtracting out 
+  routing area at a channel width of 300. We use a channel width of 300 because it can route 
+  all the VTR 6.0 benchmark circuits with an approximately 20% safety margin, and is also close to the
+  total channel width of Stratix IV. Hence this channel width is close to the commercial practice of
+  choosing a width that provides high routability. The architecture can be routed at different channel
+  widths, but we estimate the tile size and hence the physical length of routing wires assuming
+  a channel width of 300.
+
+  Sanity checks employed:
+    1.  We confirmed the routing buffer delay is ~1/3rd of total routing delay at L = 4. This matches 
+        common electrical design.
+
+
+  Authors: Jason Luu, Jeff Goeders, Vaughn Betz
+-->
+<architecture>
+  <!-- 
+       ODIN II specific config begins 
+       Describes the types of user-specified netlist blocks (in blif, this corresponds to 
+       ".model [type_of_block]") that this architecture supports.
+
+       Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
+       already special structures in blif (.names, .input, .output, and .latch) 
+       that describe them.
+  -->
+  <models>
+    <model name="multiply">
+      <input_ports>
+        <port name="a" combinational_sink_ports="out"/>
+        <port name="b" combinational_sink_ports="out"/>
+      </input_ports>
+      <output_ports>
+        <port name="out"/>
+      </output_ports>
+    </model>
+    <model name="single_port_ram">
+      <input_ports>
+        <port name="we" clock="clk"/>
+        <!-- control -->
+        <port name="addr" clock="clk"/>
+        <!-- address lines -->
+        <port name="data" clock="clk"/>
+        <!-- data lines can be broken down into smaller bit widths minimum size 1 -->
+        <port name="clk" is_clock="1"/>
+        <!-- memories are often clocked -->
+      </input_ports>
+      <output_ports>
+        <port name="out" clock="clk"/>
+        <!-- output can be broken down into smaller bit widths minimum size 1 -->
+      </output_ports>
+    </model>
+    <model name="dual_port_ram">
+      <input_ports>
+        <port name="we1" clock="clk"/>
+        <!-- write enable -->
+        <port name="we2" clock="clk"/>
+        <!-- write enable -->
+        <port name="addr1" clock="clk"/>
+        <!-- address lines -->
+        <port name="addr2" clock="clk"/>
+        <!-- address lines -->
+        <port name="data1" clock="clk"/>
+        <!-- data lines can be broken down into smaller bit widths minimum size 1 -->
+        <port name="data2" clock="clk"/>
+        <!-- data lines can be broken down into smaller bit widths minimum size 1 -->
+        <port name="clk" is_clock="1"/>
+        <!-- memories are often clocked -->
+      </input_ports>
+      <output_ports>
+        <port name="out1" clock="clk"/>
+        <!-- output can be broken down into smaller bit widths minimum size 1 -->
+        <port name="out2" clock="clk"/>
+        <!-- output can be broken down into smaller bit widths minimum size 1 -->
+      </output_ports>
+    </model>
+    <model name="adder">
+      <input_ports>
+        <port name="a" combinational_sink_ports="sumout cout"/>
+        <port name="b" combinational_sink_ports="sumout cout"/>
+        <port name="cin" combinational_sink_ports="sumout cout"/>
+      </input_ports>
+      <output_ports>
+        <port name="cout"/>
+        <port name="sumout"/>
+      </output_ports>
+    </model>
+  </models>
+  <tiles>
+    <tile name="io" area="0">
+      <sub_tile name="io" capacity="8">
+        <equivalent_sites>
+          <site pb_type="io" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="outpad" num_pins="1"/>
+        <output name="inpad" num_pins="1"/>
+        <clock name="clock" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+        <pinlocations pattern="custom">
+          <loc side="left">io.outpad io.inpad io.clock</loc>
+          <loc side="top">io.outpad io.inpad io.clock</loc>
+          <loc side="right">io.outpad io.inpad io.clock</loc>
+          <loc side="bottom">io.outpad io.inpad io.clock</loc>
+        </pinlocations>
+      </sub_tile>
+    </tile>
+    <tile name="clb" area="53894">
+      <sub_tile name="clb">
+        <equivalent_sites>
+          <site pb_type="clb" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="I" num_pins="40" equivalent="full"/>
+        <input name="cin" num_pins="1"/>
+        <output name="O" num_pins="20" equivalent="none"/>
+        <output name="cout" num_pins="1"/>
+        <clock name="clk" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+          <fc_override port_name="cin" fc_type="frac" fc_val="0"/>
+          <fc_override port_name="cout" fc_type="frac" fc_val="0"/>
+        </fc>
+        <pinlocations pattern="spread"/>
+      </sub_tile>
+    </tile>
+    <tile name="mult_36" height="4" area="396000">
+      <sub_tile name="mult_36">
+        <equivalent_sites>
+          <site pb_type="mult_36" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="a" num_pins="36"/>
+        <input name="b" num_pins="36"/>
+        <output name="out" num_pins="72"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+        <pinlocations pattern="spread"/>
+      </sub_tile>
+    </tile>
+    <tile name="memory" height="6" area="548000">
+      <sub_tile name="memory">
+        <equivalent_sites>
+          <site pb_type="memory" pin_mapping="direct"/>
+        </equivalent_sites>
+        <input name="addr1" num_pins="15"/>
+        <input name="addr2" num_pins="15"/>
+        <input name="data" num_pins="64"/>
+        <input name="we1" num_pins="1"/>
+        <input name="we2" num_pins="1"/>
+        <output name="out" num_pins="64"/>
+        <clock name="clk" num_pins="1"/>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+        <pinlocations pattern="spread"/>
+      </sub_tile>
+    </tile>
+  </tiles>
+  <!-- ODIN II specific config ends -->
+  <!-- Physical descriptions begin -->
+  <layout>
+    <auto_layout aspect_ratio="1.0">
+      <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners-->
+      <perimeter type="io" priority="100"/>
+      <corners type="EMPTY" priority="101"/>
+      <!--Fill with 'clb'-->
+      <fill type="clb" priority="10"/>
+      <!--Column of 'mult_36' with 'EMPTY' blocks wherever a 'mult_36' does not fit. Vertical offset by 1 for perimeter.-->
+      <col type="mult_36" startx="6" starty="1" repeatx="8" priority="20"/>
+      <col type="EMPTY" startx="6" repeatx="8" starty="1" priority="19"/>
+      <!--Column of 'memory' with 'EMPTY' blocks wherever a 'memory' does not fit. Vertical offset by 1 for perimeter.-->
+      <col type="memory" startx="2" starty="1" repeatx="8" priority="20"/>
+      <col type="EMPTY" startx="2" repeatx="8" starty="1" priority="19"/>
+    </auto_layout>
+  </layout>
+  <device>
+    <!-- VB & JL: Using Ian Kuon's transistor sizing and drive strength data for routing, at 40 nm. Ian used BPTM 
+			     models. We are modifying the delay values however, to include metal C and R, which allows more architecture
+			     experimentation. We are also modifying the relative resistance of PMOS to be 1.8x that of NMOS
+			     (vs. Ian's 3x) as 1.8x lines up with Jeff G's data from a 45 nm process (and is more typical of 
+			     45 nm in general). I'm upping the Rmin_nmos from Ian's just over 6k to nearly 9k, and dropping 
+			     RminW_pmos from 18k to 16k to hit this 1.8x ratio, while keeping the delays of buffers approximately
+			     lined up with Stratix IV. 
+			     We are using Jeff G.'s capacitance data for 45 nm (in tech/ptm_45nm).
+			     Jeff's tables list C in for transistors with widths in multiples of the minimum feature size (45 nm).
+			     The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply drive strength sizes in this file
+	                     by 2.5x when looking up in Jeff's tables.
+			     The delay values are lined up with Stratix IV, which has an architecture similar to this
+			     proposed FPGA, and which is also 40 nm 
+			     C_ipin_cblock: input capacitance of a track buffer, which VPR assumes is a single-stage
+			     4x minimum drive strength buffer. -->
+    <sizing R_minW_nmos="8926" R_minW_pmos="16067"/>
+    <!-- The grid_logic_tile_area below will be used for all blocks that do not explicitly set their own (non-routing)
+     	  area; set to 0 since we explicitly set the area of all blocks currently in this architecture file.
+	    -->
+    <area grid_logic_tile_area="0"/>
+    <chan_width_distr>
+      <x distr="uniform" peak="1.000000"/>
+      <y distr="uniform" peak="1.000000"/>
+    </chan_width_distr>
+    <switch_block type="wilton" fs="3"/>
+    <connection_block input_switch_name="ipin_cblock"/>
+  </device>
+  <switchlist>
+    <!-- VB: the mux_trans_size and buf_size data below is in minimum width transistor *areas*, assuming the purple
+           book area formula. This means the mux transistors are about 5x minimum drive strength.
+           We assume the first stage of the buffer is 3x min drive strength to be reasonable given the large 
+           mux transistors, and this gives a reasonable stage ratio of a bit over 5x to the second stage. We assume
+           the n and p transistors in the first stage are equal-sized to lower the buffer trip point, since it's fed
+           by a pass transistor mux. We can then reverse engineer the buffer second stage to hit the specified 
+           buf_size (really buffer area) - 16.2x minimum drive nmos and 1.8*16.2 = 29.2x minimum drive pmos.
+           I then took the data from Jeff G.'s PTM modeling of 45 nm to get the Cin (gate of first stage) and Cout 
+           (diff of second stage) listed below.  Jeff's models are in tech/ptm_45nm, and are in min feature multiples.
+           The minimum contactable transistor is 2.5 * 45 nm, so I need to multiply the drive strength sizes above by 
+           2.5x when looking up in Jeff's tables.
+           Finally, we choose a switch delay (58 ps) that leads to length 4 wires having a delay equal to that of SIV of 126 ps.
+           This also leads to the switch being 46% of the total wire delay, which is reasonable. -->
+    <switch type="mux" name="0" R="551" Cin=".77e-15" Cout="4e-15" Tdel="58e-12" mux_trans_size="2.630740" buf_size="27.645901"/>
+    <!--switch ipin_cblock resistance set to yield a 4x minimum drive strength buffer (R_minW_nmos / 4)-->
+    <switch type="mux" name="ipin_cblock" R="2231.5" Cout="0." Cin="1.47e-15" Tdel="7.247000e-11" mux_trans_size="1.222260" buf_size="auto"/>
+  </switchlist>
+  <segmentlist>
+    <!--- VB & JL: using ITRS metal stack data, 96 nm half pitch wires, which are intermediate metal width/space.  
+             With the 96 nm half pitch, such wires would take 60 um of height, vs. a 90 um high (approximated as square) Stratix IV tile so this seems
+             reasonable. Using a tile length of 90 um, corresponding to the length of a Stratix IV tile if it were square. -->
+    <segment freq="1.000000" length="4" type="unidir" Rmetal="101" Cmetal="22.5e-15">
+      <mux name="0"/>
+      <sb type="pattern">1 1 1 1 1</sb>
+      <cb type="pattern">1 1 1 1</cb>
+    </segment>
+  </segmentlist>
+  <directlist>
+    <direct name="adder_carry" from_pin="clb.cout" to_pin="clb.cin" x_offset="0" y_offset="-1" z_offset="0"/>
+  </directlist>
+  <complexblocklist>
+    <!-- Define I/O pads begin -->
+    <!-- Capacity is a unique property of I/Os, it is the maximum number of I/Os that can be placed at the same (X,Y) location on the FPGA -->
+    <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. -->
+    <pb_type name="io">
+      <input name="outpad" num_pins="1"/>
+      <output name="inpad" num_pins="1"/>
+      <clock name="clock" num_pins="1"/>
+      <!-- IOs can operate as either inputs or outputs.
+	     Maximum delays below come from Ian Kuon. They are small, so they should be interpreted as
+	     the delays to and from registers in the I/O (and generally I/Os are registered 
+	     today and that is when you timing analyze them).
+
+		 Minimum delays are retrieved using a ratio of maximum and minimum times as seen in Quartus II
+		 in Stratix IV. The ratio of minimum value/maximum value is as follows:
+			inpad delay:  0.9239
+			outpad delay: 0.9545
+
+	     -->
+      <mode name="inpad">
+        <pb_type name="inpad" blif_model=".input" num_pb="1">
+          <output name="inpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="inpad" input="inpad.inpad" output="io.inpad">
+            <delay_constant max="4.243e-11" min="3.92e-11" in_port="inpad.inpad" out_port="io.inpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="outpad">
+        <pb_type name="outpad" blif_model=".output" num_pb="1">
+          <input name="outpad" num_pins="1"/>
+        </pb_type>
+        <interconnect>
+          <direct name="outpad" input="io.outpad" output="outpad.outpad">
+            <delay_constant max="1.394e-11" min="1.331e-11" in_port="io.outpad" out_port="outpad.outpad"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin is driven by 10% of the tracks in a channel -->
+      <!-- IOs go on the periphery of the FPGA, for consistency, 
+          make it physically equivalent on all sides so that only one definition of I/Os is needed.
+          If I do not make a physically equivalent definition, then I need to define 4 different I/Os, one for each side of the FPGA
+        -->
+      <!-- Place I/Os on the sides of the FPGA -->
+      <power method="ignore"/>
+    </pb_type>
+    <!-- Define I/O pads ends -->
+    <!-- Define general purpose logic block (CLB) begin -->
+    <!--- Area calculation: Total Stratix IV tile area is about 8100 um^2, and a minimum width transistor 
+	   area is 60 L^2 yields a tile area of 84375 MWTAs.
+	   Routing at W=300 is 30481 MWTAs, leaving us with a total of 53000 MWTAs for logic block area 
+	   This means that only 37% of our area is in the general routing, and 63% is inside the logic
+	   block. Note that the crossbar / local interconnect is considered part of the logic block
+	   area in this analysis. That is a lower proportion of routing area than most academics
+	   assume, but note that the total routing area really includes the crossbar, which would push
+	   routing area up significantly, we estimate into the ~70% range. 
+	   -->
+    <pb_type name="clb">
+      <input name="I" num_pins="40" equivalent="full"/>
+      <input name="cin" num_pins="1"/>
+      <output name="O" num_pins="20" equivalent="none"/>
+      <output name="cout" num_pins="1"/>
+      <clock name="clk" num_pins="1"/>
+      <!-- Describe fracturable logic element.  
+             Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs. 
+             The outputs of the fracturable logic element can be optionally registered
+        -->
+      <pb_type name="fle" num_pb="10">
+        <input name="in" num_pins="6"/>
+        <input name="cin" num_pins="1"/>
+        <output name="out" num_pins="2"/>
+        <output name="cout" num_pins="1"/>
+        <clock name="clk" num_pins="1"/>
+        <mode name="n2_lut5">
+          <pb_type name="lut5inter" num_pb="1">
+            <input name="in" num_pins="5"/>
+            <input name="cin" num_pins="1"/>
+            <output name="out" num_pins="2"/>
+            <output name="cout" num_pins="1"/>
+            <clock name="clk" num_pins="1"/>
+            <pb_type name="ble5" num_pb="2">
+              <input name="in" num_pins="5"/>
+              <input name="cin" num_pins="1"/>
+              <output name="out" num_pins="1"/>
+              <output name="cout" num_pins="1"/>
+              <clock name="clk" num_pins="1"/>
+              <mode name="blut5">
+                <pb_type name="flut5" num_pb="1">
+                  <input name="in" num_pins="5"/>
+                  <output name="out" num_pins="1"/>
+                  <clock name="clk" num_pins="1"/>
+                  <!-- Regular LUT mode -->
+                  <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut">
+                    <input name="in" num_pins="5" port_class="lut_in"/>
+                    <output name="out" num_pins="1" port_class="lut_out"/>
+                    <!-- LUT timing using delay matrix -->
+                    <!-- These are the physical maximum delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
+                           we instead take the average of these numbers to get more stable results
+                        82e-12
+                        173e-12
+                        261e-12
+                        263e-12
+                        398e-12
+							The minimum delay/maximum delay ratio is 0.7395 in QII on Stratix IV. Hence, the minimum delay
+						is 0.7395 * the average of the maximum numbers
+                        -->
+                    <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out">
+                        235e-12
+                        235e-12
+                        235e-12
+                        235e-12
+                        235e-12
+                      </delay_matrix>
+                    <delay_matrix type="min" in_port="lut5.in" out_port="lut5.out">
+                        174e-12
+                        174e-12
+                        174e-12
+                        174e-12
+                        174e-12
+                      </delay_matrix>
+                  </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="66e-12" port="ff.D" clock="clk"/>
+                    <T_hold value="37e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="124e-12" min="60e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                  <interconnect>
+                    <direct name="direct1" input="flut5.in" output="lut5.in"/>
+                    <direct name="direct2" input="lut5.out" output="ff.D">
+                      <pack_pattern name="ble5" in_port="lut5.out" out_port="ff.D"/>
+                    </direct>
+                    <direct name="direct3" input="flut5.clk" output="ff.clk"/>
+                    <mux name="mux1" input="ff.Q lut5.out" output="flut5.out">
+                      <delay_constant max="25e-12" min="24e-12" in_port="lut5.out" out_port="flut5.out"/>
+                      <delay_constant max="45e-12" min="27e-12" in_port="ff.Q" out_port="flut5.out"/>
+                    </mux>
+                  </interconnect>
+                </pb_type>
+                <interconnect>
+                  <direct name="direct1" input="ble5.in" output="flut5.in"/>
+                  <direct name="direct2" input="ble5.clk" output="flut5.clk"/>
+                  <direct name="direct3" input="flut5.out" output="ble5.out"/>
+                </interconnect>
+              </mode>
+              <mode name="arithmetic">
+                <pb_type name="arithmetic" num_pb="1">
+                  <input name="in" num_pins="4"/>
+                  <input name="cin" num_pins="1"/>
+                  <output name="out" num_pins="1"/>
+                  <output name="cout" num_pins="1"/>
+                  <clock name="clk" num_pins="1"/>
+                  <!-- Special dual-LUT mode that drives adder only -->
+                  <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut">
+                    <input name="in" num_pins="4" port_class="lut_in"/>
+                    <output name="out" num_pins="1" port_class="lut_out"/>
+                    <!-- LUT timing using delay matrix -->
+                    <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
+                             we instead take the average of these numbers to get more stable results
+                        82e-12
+                        173e-12
+                        261e-12
+                        263e-12
+							The minimum delay/maximum delay ratio is 0.7395 in QII on Stratix IV. Hence, the minimum delay
+						is 0.7395 * the average of the maximum numbers
+                        -->
+                    <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
+                        195e-12
+                        195e-12
+                        195e-12
+                        195e-12
+                      </delay_matrix>
+                    <delay_matrix type="min" in_port="lut4.in" out_port="lut4.out">
+                        144e-12
+                        144e-12
+                        144e-12
+                        144e-12
+                      </delay_matrix>
+                  </pb_type>
+                  <!-- The ratio between minimum and maximum delays in StratixIV for data ports to sumout
+							is 0.6809 and cin to sumout is 0.6969-->
+                  <pb_type name="adder" blif_model=".subckt adder" num_pb="1">
+                    <input name="a" num_pins="1"/>
+                    <input name="b" num_pins="1"/>
+                    <input name="cin" num_pins="1"/>
+                    <output name="cout" num_pins="1"/>
+                    <output name="sumout" num_pins="1"/>
+                    <delay_constant max="0.3e-9" min="0.2043e-9" in_port="adder.a" out_port="adder.sumout"/>
+                    <delay_constant max="0.3e-9" min="0.2043e-9" in_port="adder.b" out_port="adder.sumout"/>
+                    <delay_constant max="0.3e-9" min="0.2043e-9" in_port="adder.cin" out_port="adder.sumout"/>
+                    <delay_constant max="0.3e-9" min="0.2043e-9" in_port="adder.a" out_port="adder.cout"/>
+                    <delay_constant max="0.3e-9" min="0.2043e-9" in_port="adder.b" out_port="adder.cout"/>
+                    <delay_constant max="0.01e-9" min="6.9797e-12" in_port="adder.cin" out_port="adder.cout"/>
+                  </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="66e-12" port="ff.D" clock="clk"/>
+                    <T_hold value="37e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="124e-12" min="60e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                  <interconnect>
+                    <direct name="clock" input="arithmetic.clk" output="ff.clk"/>
+                    <direct name="lut_in1" input="arithmetic.in[3:0]" output="lut4[0:0].in[3:0]"/>
+                    <direct name="lut_in2" input="arithmetic.in[3:0]" output="lut4[1:1].in[3:0]"/>
+                    <direct name="lut_to_add1" input="lut4[0:0].out" output="adder.a">
+                      </direct>
+                    <direct name="lut_to_add2" input="lut4[1:1].out" output="adder.b">
+                      </direct>
+                    <direct name="add_to_ff" input="adder.sumout" output="ff.D">
+                      <pack_pattern name="chain" in_port="adder.sumout" out_port="ff.D"/>
+                    </direct>
+                    <direct name="carry_in" input="arithmetic.cin" output="adder.cin">
+                      <pack_pattern name="chain" in_port="arithmetic.cin" out_port="adder.cin"/>
+                    </direct>
+                    <direct name="carry_out" input="adder.cout" output="arithmetic.cout">
+                      <pack_pattern name="chain" in_port="adder.cout" out_port="arithmetic.cout"/>
+                    </direct>
+                    <mux name="sumout" input="ff.Q adder.sumout" output="arithmetic.out">
+                      <delay_constant max="25e-12" min="24e-12" in_port="adder.sumout" out_port="arithmetic.out"/>
+                      <delay_constant max="45e-12" min="27e-12" in_port="ff.Q" out_port="arithmetic.out"/>
+                    </mux>
+                  </interconnect>
+                </pb_type>
+                <interconnect>
+                  <direct name="direct1" input="ble5.in[3:0]" output="arithmetic.in"/>
+                  <direct name="carry_in" input="ble5.cin" output="arithmetic.cin">
+                    <pack_pattern name="chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                  </direct>
+                  <direct name="carry_out" input="arithmetic.cout" output="ble5.cout">
+                    <pack_pattern name="chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                  </direct>
+                  <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/>
+                  <direct name="direct3" input="arithmetic.out" output="ble5.out"/>
+                </interconnect>
+              </mode>
+            </pb_type>
+            <interconnect>
+              <direct name="direct1" input="lut5inter.in" output="ble5[0:0].in"/>
+              <direct name="direct2" input="lut5inter.in" output="ble5[1:1].in"/>
+              <direct name="direct3" input="ble5[1:0].out" output="lut5inter.out"/>
+              <direct name="carry_in" input="lut5inter.cin" output="ble5[0:0].cin">
+                <pack_pattern name="chain" in_port="lut5inter.cin" out_port="ble5[0:0].cin"/>
+              </direct>
+              <direct name="carry_out" input="ble5[1:1].cout" output="lut5inter.cout">
+                <pack_pattern name="chain" in_port="ble5[1:1].cout" out_port="lut5inter.cout"/>
+              </direct>
+              <direct name="carry_link" input="ble5[0:0].cout" output="ble5[1:1].cin"> <!-- carry-out of ble5[0] drives carry-in of ble5[1]; the pack_pattern ports must name this same edge -->
+                <pack_pattern name="chain" in_port="ble5[0:0].cout" out_port="ble5[1:1].cin"/>
+              </direct>
+              <complete name="complete1" input="lut5inter.clk" output="ble5[1:0].clk"/>
+            </interconnect>
+          </pb_type>
+          <interconnect>
+            <direct name="direct1" input="fle.in[4:0]" output="lut5inter.in"/>
+            <direct name="direct2" input="lut5inter.out" output="fle.out"/>
+            <direct name="direct3" input="fle.clk" output="lut5inter.clk"/>
+            <direct name="carry_in" input="fle.cin" output="lut5inter.cin">
+              <pack_pattern name="chain" in_port="fle.cin" out_port="lut5inter.cin"/>
+            </direct>
+            <direct name="carry_out" input="lut5inter.cout" output="fle.cout">
+              <pack_pattern name="chain" in_port="lut5inter.cout" out_port="fle.cout"/>
+            </direct>
+          </interconnect>
+        </mode>
+        <!-- n2_lut5 -->
+        <mode name="n1_lut6">
+          <pb_type name="ble6" num_pb="1">
+            <input name="in" num_pins="6"/>
+            <output name="out" num_pins="1"/>
+            <clock name="clk" num_pins="1"/>
+            <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut">
+              <input name="in" num_pins="6" port_class="lut_in"/>
+              <output name="out" num_pins="1" port_class="lut_out"/>
+              <!-- LUT timing using delay matrix -->
+              <!-- These are the physical delay inputs on a Stratix IV LUT but because VPR cannot do LUT rebalancing,
+                       we instead take the average of these numbers to get more stable results
+                  82e-12
+                  173e-12
+                  261e-12
+                  263e-12
+                  398e-12
+                  397e-12
+
+					The minimum delay/maximum delay ratio is 0.7395 in QII on Stratix IV. Hence, the minimum delay
+						is 0.7295 * the average of the maximum numbers
+                  -->
+              <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out">
+                  261e-12
+                  261e-12
+                  261e-12
+                  261e-12
+                  261e-12
+                  261e-12
+                </delay_matrix>
+              <delay_matrix type="min" in_port="lut6.in" out_port="lut6.out">
+                  174e-12
+                  174e-12
+                  174e-12
+                  174e-12
+                  174e-12
+                  174e-12
+                </delay_matrix>
+            </pb_type>
+            <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+              <input name="D" num_pins="1" port_class="D"/>
+              <output name="Q" num_pins="1" port_class="Q"/>
+              <clock name="clk" num_pins="1" port_class="clock"/>
+              <T_setup value="66e-12" port="ff.D" clock="clk"/>
+              <T_hold value="37e-12" port="ff.D" clock="clk"/>
+              <T_clock_to_Q max="124e-12" min="60e-12" port="ff.Q" clock="clk"/>
+            </pb_type>
+            <interconnect>
+              <direct name="direct1" input="ble6.in" output="lut6[0:0].in"/>
+              <direct name="direct2" input="lut6.out" output="ff.D">
+                <pack_pattern name="ble6" in_port="lut6.out" out_port="ff.D"/>
+              </direct>
+              <direct name="direct3" input="ble6.clk" output="ff.clk"/>
+              <mux name="mux1" input="ff.Q lut6.out" output="ble6.out">
+                <delay_constant max="25e-12" min="24e-12" in_port="lut6.out" out_port="ble6.out"/>
+                <delay_constant max="45e-12" min="27e-12" in_port="ff.Q" out_port="ble6.out"/>
+              </mux>
+            </interconnect>
+          </pb_type>
+          <interconnect>
+            <direct name="direct1" input="fle.in" output="ble6.in"/>
+            <direct name="direct2" input="ble6.out" output="fle.out[0:0]"/>
+            <direct name="direct3" input="fle.clk" output="ble6.clk"/>
+          </interconnect>
+        </mode>
+        <!-- n1_lut6 -->
+      </pb_type>
+      <interconnect>
+        <!-- We use a full crossbar to get logical equivalence at inputs of CLB 
+           The delays below come from Stratix IV. the delay through a connection block
+           input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps 
+           delay on the connection block input mux (modeled by Ian Kuon), so the remaining
+           delay within the crossbar is 95 ps. 
+		   For the minimum delays, we have the delay through a connection block input mux +
+		   the crossbar in Stratix IV is 144 ps. Subtracting the 72 ps leaves 72 ps remaining.
+           The max delays of cluster feedbacks in Stratix IV is 100 ps, when driven by a LUT.
+           Since all our LUT outputs go to a BLE output, and have a delay of 
+           25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback
+           to get the part that should be marked on the crossbar. For the minimum delay,
+  		   the value in Stratix IV is 93 ps, subtracting the 24 ps leaves 69 ps.-->
+        <complete name="crossbar" input="clb.I fle[9:0].out" output="fle[9:0].in">
+          <delay_constant max="95e-12" min="72e-12" in_port="clb.I" out_port="fle[9:0].in"/>
+          <delay_constant max="75e-12" min="69e-12" in_port="fle[9:0].out" out_port="fle[9:0].in"/>
+        </complete>
+        <complete name="clks" input="clb.clk" output="fle[9:0].clk">
+          </complete>
+        <!-- This way of specifying direct connection to clb outputs is important because this architecture uses automatic spreading of opins.  
+                 By grouping the output pins in this fashion, if a logic block is completely filled by 6-LUTs, 
+                 then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more
+                 naive specification).
+          -->
+        <direct name="clbouts1" input="fle[9:0].out[0:0]" output="clb.O[9:0]"/>
+        <direct name="clbouts2" input="fle[9:0].out[1:1]" output="clb.O[19:10]"/>
+        <!-- Carry chain links -->
+        <direct name="carry_in" input="clb.cin" output="fle[0:0].cin">
+          <!-- Put all inter-block carry chain delay on this one edge -->
+          <delay_constant max="0.16e-9" min="0.11e-9" in_port="clb.cin" out_port="fle[0:0].cin"/>
+          <pack_pattern name="chain" in_port="clb.cin" out_port="fle[0:0].cin"/>
+        </direct>
+        <direct name="carry_out" input="fle[9:9].cout" output="clb.cout">
+          <pack_pattern name="chain" in_port="fle[9:9].cout" out_port="clb.cout"/>
+        </direct>
+        <direct name="carry_link" input="fle[8:0].cout" output="fle[9:1].cin">
+          <pack_pattern name="chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+        </direct>
+      </interconnect>
+    </pb_type>
+    <!-- Define general purpose logic block (CLB) ends -->
+    <!-- Define fracturable multiplier begin -->
+    <!-- This multiplier can operate as a 36x36 multiplier that can fracture to two 18x18 multipliers each of which can further fracture to two 9x9 multipliers 
+	   For delay modelling, the 36x36 DSP multiplier in Stratix IV has a maximum delay of 1.523 ns + 1.93 ns
+	    = 3.45 ns. The average difference between the maximum and minimum values of a dsp mac out in 36 bit multiply
+		mode is 0.51. Hence, the minimum delay is modeled as 0.776 ns + 0.984 ns = 1.760 ns.
+ 		The 18x18 mode doesn't need to sum four 18x18 multipliers, so it is a bit
+	   faster: 1.523 ns for the multiplier, and 1.09 ns for the multiplier output block.
+	    For the input and output interconnect delays, unlike Stratix IV, we don't
+	   have any routing/logic flexibility (crossbars) at the inputs.  There is some output muxing
+	   in Stratix IV and this architecture to select which multiplier outputs should go out (e.g.
+	   9x9 outputs, 18x18 or 36x36) so those are very close between the two architectures. 
+	   We take the conservative (slightly pessimistic)
+           approach modelling the input as the same as the Stratix IV input delay and the output delay the same as the Stratix IV DSP out delay.
+		   
+	   We estimate block area by using the published Stratix III data (which is architecturally identical to Stratix IV)
+	      (H. Wong, V. Betz and J. Rose, "Comparing FPGA vs. Custom CMOS and the Impact on Processor Microarchitecture", FPGA 2011) of 0.2623 
+		  mm^2 and scaling from 65 to 40 nm to obtain 0.0993 mm^2. That area is for a DSP block with approximately 2x the functionality of 
+		  the block we use (can implement two 36x36 multiplies instead of our 1, eight 18x18 multiplies instead of our 4, etc.). Hence we 
+		  divide the area by 2 to obtain 0.0497 mm^2. One minimum-width transistor unit (MWTU) = 60 L^2 (where L = 40 nm), so this is 518,000 MWTUs. 
+		  That area includes routing and the connection block input muxes.  Our DSP block is four 
+		  rows high, and hence includes four horizontal routing channel segments and four vertical ones, which is 4x the routing of a logic 
+		  block (single tile). It also includes 3.6x the outputs of a logic block, and 1.8x the inputs. Hence a slight overestimate of the routing
+		  area associated with our DSP block is four times that of a logic tile, where the routing area of a logic tile was calculated above (at W = 300)
+		  as 30,481 MWTUs. Hence the (core, non-routing) area of our DSP block is approximately 518,000 - 4 * 30,481 = 396,000 MWTUs.
+      -->
+    <pb_type name="mult_36">
+      <input name="a" num_pins="36"/>
+      <input name="b" num_pins="36"/>
+      <output name="out" num_pins="72"/>
+      <mode name="two_divisible_mult_18x18">
+        <pb_type name="divisible_mult_18x18" num_pb="2">
+          <input name="a" num_pins="18"/>
+          <input name="b" num_pins="18"/>
+          <output name="out" num_pins="36"/>
+          <!-- Model 9x9 delay and 18x18 delay as the same.  9x9 could be faster, but in Stratix IV
+	          isn't, presumably because the multiplier layout is really optimized for 18x18.
+		-->
+          <mode name="two_mult_9x9">
+            <pb_type name="mult_9x9_slice" num_pb="2">
+              <input name="A_cfg" num_pins="9"/>
+              <input name="B_cfg" num_pins="9"/>
+              <output name="OUT_cfg" num_pins="18"/>
+              <pb_type name="mult_9x9" blif_model=".subckt multiply" num_pb="1">
+                <input name="a" num_pins="9"/>
+                <input name="b" num_pins="9"/>
+                <output name="out" num_pins="18"/>
+                <delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_9x9.a" out_port="mult_9x9.out"/>
+                <delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_9x9.b" out_port="mult_9x9.out"/>
+              </pb_type>
+              <interconnect>
+                <direct name="a2a" input="mult_9x9_slice.A_cfg" output="mult_9x9.a">
+                </direct>
+                <direct name="b2b" input="mult_9x9_slice.B_cfg" output="mult_9x9.b">
+                </direct>
+                <direct name="out2out" input="mult_9x9.out" output="mult_9x9_slice.OUT_cfg">
+                </direct>
+              </interconnect>
+              <power method="pin-toggle">
+                <port name="A_cfg" energy_per_toggle="1.45e-12"/>
+                <port name="B_cfg" energy_per_toggle="1.45e-12"/>
+                <static_power power_per_instance="0.0"/>
+              </power>
+            </pb_type>
+            <interconnect>
+              <direct name="a2a" input="divisible_mult_18x18.a" output="mult_9x9_slice[1:0].A_cfg">
+              </direct>
+              <direct name="b2b" input="divisible_mult_18x18.b" output="mult_9x9_slice[1:0].B_cfg">
+              </direct>
+              <direct name="out2out" input="mult_9x9_slice[1:0].OUT_cfg" output="divisible_mult_18x18.out">
+              </direct>
+            </interconnect>
+          </mode>
+          <mode name="mult_18x18">
+            <pb_type name="mult_18x18_slice" num_pb="1">
+              <input name="A_cfg" num_pins="18"/>
+              <input name="B_cfg" num_pins="18"/>
+              <output name="OUT_cfg" num_pins="36"/>
+              <pb_type name="mult_18x18" blif_model=".subckt multiply" num_pb="1">
+                <input name="a" num_pins="18"/>
+                <input name="b" num_pins="18"/>
+                <output name="out" num_pins="36"/>
+                <delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_18x18.a" out_port="mult_18x18.out"/>
+                <delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_18x18.b" out_port="mult_18x18.out"/>
+              </pb_type>
+              <interconnect>
+                <direct name="a2a" input="mult_18x18_slice.A_cfg" output="mult_18x18.a">
+                </direct>
+                <direct name="b2b" input="mult_18x18_slice.B_cfg" output="mult_18x18.b">
+                </direct>
+                <direct name="out2out" input="mult_18x18.out" output="mult_18x18_slice.OUT_cfg">
+                </direct>
+              </interconnect>
+              <power method="pin-toggle">
+                <port name="A_cfg" energy_per_toggle="1.09e-12"/>
+                <port name="B_cfg" energy_per_toggle="1.09e-12"/>
+                <static_power power_per_instance="0.0"/>
+              </power>
+            </pb_type>
+            <interconnect>
+              <direct name="a2a" input="divisible_mult_18x18.a" output="mult_18x18_slice.A_cfg">
+              </direct>
+              <direct name="b2b" input="divisible_mult_18x18.b" output="mult_18x18_slice.B_cfg">
+              </direct>
+              <direct name="out2out" input="mult_18x18_slice.OUT_cfg" output="divisible_mult_18x18.out">
+              </direct>
+            </interconnect>
+          </mode>
+          <power method="sum-of-children"/>
+        </pb_type>
+        <interconnect>
+          <!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier. 
+		   Subtract 72.5 ps delay, which is already in the connection block input mux, leading to a 134 ps delay.
+				The interconnect difference for DSP blocks is 0.5523, which leads to a minimum delay of 74 ps
+              -->
+          <direct name="a2a" input="mult_36.a" output="divisible_mult_18x18[1:0].a">
+            <delay_constant max="134e-12" min="74e-12" in_port="mult_36.a" out_port="divisible_mult_18x18[1:0].a"/>
+          </direct>
+          <direct name="b2b" input="mult_36.b" output="divisible_mult_18x18[1:0].b">
+            <delay_constant max="134e-12" min="74e-12" in_port="mult_36.b" out_port="divisible_mult_18x18[1:0].b"/>
+          </direct>
+          <direct name="out2out" input="divisible_mult_18x18[1:0].out" output="mult_36.out">
+            <delay_constant max="1.09e-9" min="74e-12" in_port="divisible_mult_18x18[1:0].out" out_port="mult_36.out"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mult_36x36">
+        <pb_type name="mult_36x36_slice" num_pb="1">
+          <input name="A_cfg" num_pins="36"/>
+          <input name="B_cfg" num_pins="36"/>
+          <output name="OUT_cfg" num_pins="72"/>
+          <pb_type name="mult_36x36" blif_model=".subckt multiply" num_pb="1">
+            <input name="a" num_pins="36"/>
+            <input name="b" num_pins="36"/>
+            <output name="out" num_pins="72"/>
+            <delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_36x36.a" out_port="mult_36x36.out"/>
+            <delay_constant max="1.523e-9" min="0.776e-9" in_port="mult_36x36.b" out_port="mult_36x36.out"/>
+          </pb_type>
+          <interconnect>
+            <direct name="a2a" input="mult_36x36_slice.A_cfg" output="mult_36x36.a">
+            </direct>
+            <direct name="b2b" input="mult_36x36_slice.B_cfg" output="mult_36x36.b">
+            </direct>
+            <direct name="out2out" input="mult_36x36.out" output="mult_36x36_slice.OUT_cfg">
+            </direct>
+          </interconnect>
+          <power method="pin-toggle">
+            <port name="A_cfg" energy_per_toggle="2.13e-12"/>
+            <port name="B_cfg" energy_per_toggle="2.13e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier. 
+		   Subtract 72.5 ps delay, which is already in the connection block input mux, leading
+		   to a 134 ps delay.
+				The interconnect difference for DSP blocks is 0.5523, which leads to a minimum delay of 74 ps
+              -->
+          <direct name="a2a" input="mult_36.a" output="mult_36x36_slice.A_cfg">
+            <delay_constant max="134e-12" min="74e-12" in_port="mult_36.a" out_port="mult_36x36_slice.A_cfg"/>
+          </direct>
+          <direct name="b2b" input="mult_36.b" output="mult_36x36_slice.B_cfg">
+            <delay_constant max="134e-12" min="74e-12" in_port="mult_36.b" out_port="mult_36x36_slice.B_cfg"/>
+          </direct>
+          <direct name="out2out" input="mult_36x36_slice.OUT_cfg" output="mult_36.out">
+            <delay_constant max="1.93e-9" min="74e-12" in_port="mult_36x36_slice.OUT_cfg" out_port="mult_36.out"/>
+          </direct>
+        </interconnect>
+      </mode>
+      <!-- Place this multiplier block every 8 columns from (and including) the sixth column -->
+      <power method="sum-of-children"/>
+    </pb_type>
+    <!-- Define fracturable multiplier end -->
+    <!-- Define fracturable memory begin -->
+    <!-- 32 Kb Memory that can operate from 512x64 to 32Kx1 for single-port mode and 1024x32 to 32Kx1 for dual-port mode.  
+           Area and max delay based off Stratix IV 9K and 144K memories (delay from linear interpolation, Tsu(483 ps, 636 ps) Tco(1084ps, 1969ps)).  
+
+		   uTh/Tsu ratio = 0.468, uTco min/max ratio = 0.97
+           uTh = 226ps, 298ps
+           Min uTco = 1051ps, 1909ps
+           Max input delay = 204ps (from Stratix IV LAB line) - 72ps (this architecture does not lump connection box delay in internal delay)
+           Max output delay = M9K buffer 50ps. 
+		   Min input delay = 160ps (from Stratix IV Lab line) - 72ps
+		   Min output delay = 46ps (M9K buffer min/max ratio = 0.9286)
+		   
+		   Area is obtained by appropriately scaling and adjusting the published Stratix III (which is architecturally identical to Stratix IV)
+		   data from H. Wong, V. Betz and J. Rose, "Comparing FPGA vs. Custom CMOS and the Impact on Processor Microarchitecture", FPGA 2011.
+		   Linearly interpolating (by bit count) between the M9k and M144k areas to obtain an M32k (our RAM size) point yields a 65 nm area
+		   of 0.153 mm^2. Interpolating based on port count between the RAMs would instead yield an area of 0.209 mm^2 for our 32 kb RAM; since 
+		   bit count accounts for more area than ports for a RAM this size we choose the bit count interpolation; however, since the port interpolation
+		   is not radically different this also gives us confidence that interpolating based on bits is OK, but slightly underpredicts area.
+		   Scaling to 40 nm yields 0.0579 mm^2, and converting to MWTUs at 60 L^2 / MWTU yields 604,000 MWTUs. This includes routing. A Stratix IV
+		   M9K RAM is one row high and hence has one routing tile (one horizontal and one vertical routing segment area). An M144k RAM has 8 such tiles.
+		   Linearly interpolating on
+		   bits to 32 kb yields 2.2 routing tiles incorporated in the area number above. The inter-block routing represents 30% of the area of a logic 
+		   tile according to D. Lewis et al, "Architectural Enhancements in Stratix V," FPGA 2013. Hence we should subtract 0.3 * 2.2 * 84,375 MWTUs to
+		   obtain a RAM core area (not including inter-block routing) of 548,000 MWTU areas for our 32 kb RAM in a 40 nm process.
+      -->
+    <pb_type name="memory">
+      <input name="addr1" num_pins="15"/>
+      <input name="addr2" num_pins="15"/>
+      <input name="data" num_pins="64"/>
+      <input name="we1" num_pins="1"/>
+      <input name="we2" num_pins="1"/>
+      <output name="out" num_pins="64"/>
+      <clock name="clk" num_pins="1"/>
+      <!-- Specify single port mode first -->
+      <mode name="mem_512x64_sp">
+        <pb_type name="mem_512x64_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="9" port_class="address"/>
+          <input name="data" num_pins="64" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="64" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_512x64_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_512x64_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_512x64_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_512x64_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_512x64_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_512x64_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_512x64_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[8:0]" output="mem_512x64_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[8:0]" out_port="mem_512x64_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[63:0]" output="mem_512x64_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[63:0]" out_port="mem_512x64_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_512x64_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_512x64_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_512x64_sp.out" output="memory.out[63:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_512x64_sp.out" out_port="memory.out[63:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_512x64_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_1024x32_sp">
+        <pb_type name="mem_1024x32_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="10" port_class="address"/>
+          <input name="data" num_pins="32" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="32" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_1024x32_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_1024x32_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x32_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x32_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[31:0]" output="mem_1024x32_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[31:0]" out_port="mem_1024x32_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_1024x32_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_1024x32_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_1024x32_sp.out" output="memory.out[31:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_1024x32_sp.out" out_port="memory.out[31:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_1024x32_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_2048x16_sp">
+        <pb_type name="mem_2048x16_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="11" port_class="address"/>
+          <input name="data" num_pins="16" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="16" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_2048x16_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_2048x16_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x16_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x16_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[15:0]" output="mem_2048x16_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[15:0]" out_port="mem_2048x16_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_2048x16_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_2048x16_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_2048x16_sp.out" output="memory.out[15:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_2048x16_sp.out" out_port="memory.out[15:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_2048x16_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_4096x8_sp">
+        <pb_type name="mem_4096x8_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="12" port_class="address"/>
+          <input name="data" num_pins="8" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="8" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_4096x8_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_4096x8_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_4096x8_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_4096x8_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_4096x8_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_4096x8_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_4096x8_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[11:0]" output="mem_4096x8_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[11:0]" out_port="mem_4096x8_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[7:0]" output="mem_4096x8_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[7:0]" out_port="mem_4096x8_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_4096x8_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_4096x8_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_4096x8_sp.out" output="memory.out[7:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_4096x8_sp.out" out_port="memory.out[7:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_4096x8_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_8192x4_sp">
+        <pb_type name="mem_8192x4_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="13" port_class="address"/>
+          <input name="data" num_pins="4" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="4" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_8192x4_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_8192x4_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[12:0]" output="mem_8192x4_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[12:0]" out_port="mem_8192x4_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[3:0]" output="mem_8192x4_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[3:0]" out_port="mem_8192x4_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_8192x4_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_8192x4_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_8192x4_sp.out" output="memory.out[3:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_8192x4_sp.out" out_port="memory.out[3:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_8192x4_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_16384x2_sp">
+        <pb_type name="mem_16384x2_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="14" port_class="address"/>
+          <input name="data" num_pins="2" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="2" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_16384x2_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_16384x2_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[13:0]" output="mem_16384x2_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[13:0]" out_port="mem_16384x2_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[1:0]" output="mem_16384x2_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[1:0]" out_port="mem_16384x2_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_16384x2_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_16384x2_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_16384x2_sp.out" output="memory.out[1:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_16384x2_sp.out" out_port="memory.out[1:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_16384x2_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_32768x1_sp">
+        <pb_type name="mem_32768x1_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+          <input name="addr" num_pins="15" port_class="address"/>
+          <input name="data" num_pins="1" port_class="data_in"/>
+          <input name="we" num_pins="1" port_class="write_en"/>
+          <output name="out" num_pins="1" port_class="data_out"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_32768x1_sp.addr" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_sp.data" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_sp.we" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_sp.addr" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_sp.data" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_sp.we" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_32768x1_sp.out" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="9.0e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[14:0]" output="mem_32768x1_sp.addr">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[14:0]" out_port="mem_32768x1_sp.addr"/>
+          </direct>
+          <direct name="data1" input="memory.data[0:0]" output="mem_32768x1_sp.data">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[0:0]" out_port="mem_32768x1_sp.data"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_32768x1_sp.we">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_32768x1_sp.we"/>
+          </direct>
+          <direct name="dataout1" input="mem_32768x1_sp.out" output="memory.out[0:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_32768x1_sp.out" out_port="memory.out[0:0]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_32768x1_sp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <!-- Specify true dual port mode next -->
+      <mode name="mem_1024x32_dp">
+        <pb_type name="mem_1024x32_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="10" port_class="address1"/>
+          <input name="addr2" num_pins="10" port_class="address2"/>
+          <input name="data1" num_pins="32" port_class="data_in1"/>
+          <input name="data2" num_pins="32" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="32" port_class="data_out1"/>
+          <output name="out2" num_pins="32" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_1024x32_dp.addr1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_dp.data1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_dp.we1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_dp.addr2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_dp.data2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_1024x32_dp.we2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_dp.addr1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_dp.data1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_dp.we1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_dp.addr2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_dp.data2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_1024x32_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_1024x32_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_1024x32_dp.out2" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="17.9e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x32_dp.addr1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x32_dp.addr1"/>
+          </direct>
+          <direct name="address2" input="memory.addr2[9:0]" output="mem_1024x32_dp.addr2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr2[9:0]" out_port="mem_1024x32_dp.addr2"/>
+          </direct>
+          <direct name="data1" input="memory.data[31:0]" output="mem_1024x32_dp.data1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[31:0]" out_port="mem_1024x32_dp.data1"/>
+          </direct>
+          <direct name="data2" input="memory.data[63:32]" output="mem_1024x32_dp.data2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[63:32]" out_port="mem_1024x32_dp.data2"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_1024x32_dp.we1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_1024x32_dp.we1"/>
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_1024x32_dp.we2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we2" out_port="mem_1024x32_dp.we2"/>
+          </direct>
+          <direct name="dataout1" input="mem_1024x32_dp.out1" output="memory.out[31:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_1024x32_dp.out1" out_port="memory.out[31:0]"/>
+          </direct>
+          <direct name="dataout2" input="mem_1024x32_dp.out2" output="memory.out[63:32]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_1024x32_dp.out2" out_port="memory.out[63:32]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_1024x32_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_2048x16_dp">
+        <pb_type name="mem_2048x16_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="11" port_class="address1"/>
+          <input name="addr2" num_pins="11" port_class="address2"/>
+          <input name="data1" num_pins="16" port_class="data_in1"/>
+          <input name="data2" num_pins="16" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="16" port_class="data_out1"/>
+          <output name="out2" num_pins="16" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_2048x16_dp.addr1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_dp.data1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_dp.we1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_dp.addr2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_dp.data2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x16_dp.we2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_dp.addr1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_dp.data1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_dp.we1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_dp.addr2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_dp.data2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x16_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_2048x16_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_2048x16_dp.out2" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="17.9e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x16_dp.addr1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x16_dp.addr1"/>
+          </direct>
+          <direct name="address2" input="memory.addr2[10:0]" output="mem_2048x16_dp.addr2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr2[10:0]" out_port="mem_2048x16_dp.addr2"/>
+          </direct>
+          <direct name="data1" input="memory.data[15:0]" output="mem_2048x16_dp.data1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[15:0]" out_port="mem_2048x16_dp.data1"/>
+          </direct>
+          <direct name="data2" input="memory.data[31:16]" output="mem_2048x16_dp.data2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[31:16]" out_port="mem_2048x16_dp.data2"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_2048x16_dp.we1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_2048x16_dp.we1"/>
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_2048x16_dp.we2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we2" out_port="mem_2048x16_dp.we2"/>
+          </direct>
+          <direct name="dataout1" input="mem_2048x16_dp.out1" output="memory.out[15:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_2048x16_dp.out1" out_port="memory.out[15:0]"/>
+          </direct>
+          <direct name="dataout2" input="mem_2048x16_dp.out2" output="memory.out[31:16]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_2048x16_dp.out2" out_port="memory.out[31:16]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_2048x16_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_2048x8_dp">
+        <pb_type name="mem_2048x8_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="12" port_class="address1"/>
+          <input name="addr2" num_pins="12" port_class="address2"/>
+          <input name="data1" num_pins="8" port_class="data_in1"/>
+          <input name="data2" num_pins="8" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="8" port_class="data_out1"/>
+          <output name="out2" num_pins="8" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_2048x8_dp.addr1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x8_dp.data1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x8_dp.we1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x8_dp.addr2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x8_dp.data2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_2048x8_dp.we2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x8_dp.addr1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x8_dp.data1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x8_dp.we1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x8_dp.addr2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x8_dp.data2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_2048x8_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_2048x8_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_2048x8_dp.out2" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="17.9e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[11:0]" output="mem_2048x8_dp.addr1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[11:0]" out_port="mem_2048x8_dp.addr1"/>
+          </direct>
+          <direct name="address2" input="memory.addr2[11:0]" output="mem_2048x8_dp.addr2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr2[11:0]" out_port="mem_2048x8_dp.addr2"/>
+          </direct>
+          <direct name="data1" input="memory.data[7:0]" output="mem_2048x8_dp.data1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[7:0]" out_port="mem_2048x8_dp.data1"/>
+          </direct>
+          <direct name="data2" input="memory.data[15:8]" output="mem_2048x8_dp.data2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[15:8]" out_port="mem_2048x8_dp.data2"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_2048x8_dp.we1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_2048x8_dp.we1"/>
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_2048x8_dp.we2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we2" out_port="mem_2048x8_dp.we2"/>
+          </direct>
+          <direct name="dataout1" input="mem_2048x8_dp.out1" output="memory.out[7:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_2048x8_dp.out1" out_port="memory.out[7:0]"/>
+          </direct>
+          <direct name="dataout2" input="mem_2048x8_dp.out2" output="memory.out[15:8]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_2048x8_dp.out2" out_port="memory.out[15:8]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_2048x8_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_8192x4_dp">
+        <pb_type name="mem_8192x4_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="13" port_class="address1"/>
+          <input name="addr2" num_pins="13" port_class="address2"/>
+          <input name="data1" num_pins="4" port_class="data_in1"/>
+          <input name="data2" num_pins="4" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="4" port_class="data_out1"/>
+          <output name="out2" num_pins="4" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_8192x4_dp.addr1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_dp.data1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_dp.we1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_dp.addr2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_dp.data2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_8192x4_dp.we2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_dp.addr1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_dp.data1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_dp.we1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_dp.addr2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_dp.data2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_8192x4_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_8192x4_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_8192x4_dp.out2" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="17.9e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[12:0]" output="mem_8192x4_dp.addr1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[12:0]" out_port="mem_8192x4_dp.addr1"/>
+          </direct>
+          <direct name="address2" input="memory.addr2[12:0]" output="mem_8192x4_dp.addr2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr2[12:0]" out_port="mem_8192x4_dp.addr2"/>
+          </direct>
+          <direct name="data1" input="memory.data[3:0]" output="mem_8192x4_dp.data1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[3:0]" out_port="mem_8192x4_dp.data1"/>
+          </direct>
+          <direct name="data2" input="memory.data[7:4]" output="mem_8192x4_dp.data2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[7:4]" out_port="mem_8192x4_dp.data2"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_8192x4_dp.we1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_8192x4_dp.we1"/>
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_8192x4_dp.we2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we2" out_port="mem_8192x4_dp.we2"/>
+          </direct>
+          <direct name="dataout1" input="mem_8192x4_dp.out1" output="memory.out[3:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_8192x4_dp.out1" out_port="memory.out[3:0]"/>
+          </direct>
+          <direct name="dataout2" input="mem_8192x4_dp.out2" output="memory.out[7:4]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_8192x4_dp.out2" out_port="memory.out[7:4]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_8192x4_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_16384x2_dp">
+        <pb_type name="mem_16384x2_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="14" port_class="address1"/>
+          <input name="addr2" num_pins="14" port_class="address2"/>
+          <input name="data1" num_pins="2" port_class="data_in1"/>
+          <input name="data2" num_pins="2" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="2" port_class="data_out1"/>
+          <output name="out2" num_pins="2" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_16384x2_dp.addr1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_dp.data1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_dp.we1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_dp.addr2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_dp.data2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_16384x2_dp.we2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_dp.addr1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_dp.data1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_dp.we1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_dp.addr2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_dp.data2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_16384x2_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_16384x2_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_16384x2_dp.out2" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="17.9e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[13:0]" output="mem_16384x2_dp.addr1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[13:0]" out_port="mem_16384x2_dp.addr1"/>
+          </direct>
+          <direct name="address2" input="memory.addr2[13:0]" output="mem_16384x2_dp.addr2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr2[13:0]" out_port="mem_16384x2_dp.addr2"/>
+          </direct>
+          <direct name="data1" input="memory.data[1:0]" output="mem_16384x2_dp.data1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[1:0]" out_port="mem_16384x2_dp.data1"/>
+          </direct>
+          <direct name="data2" input="memory.data[3:2]" output="mem_16384x2_dp.data2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[3:2]" out_port="mem_16384x2_dp.data2"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_16384x2_dp.we1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_16384x2_dp.we1"/>
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_16384x2_dp.we2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we2" out_port="mem_16384x2_dp.we2"/>
+          </direct>
+          <direct name="dataout1" input="mem_16384x2_dp.out1" output="memory.out[1:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_16384x2_dp.out1" out_port="memory.out[1:0]"/>
+          </direct>
+          <direct name="dataout2" input="mem_16384x2_dp.out2" output="memory.out[3:2]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_16384x2_dp.out2" out_port="memory.out[3:2]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_16384x2_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <mode name="mem_32768x1_dp">
+        <pb_type name="mem_32768x1_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+          <input name="addr1" num_pins="15" port_class="address1"/>
+          <input name="addr2" num_pins="15" port_class="address2"/>
+          <input name="data1" num_pins="1" port_class="data_in1"/>
+          <input name="data2" num_pins="1" port_class="data_in2"/>
+          <input name="we1" num_pins="1" port_class="write_en1"/>
+          <input name="we2" num_pins="1" port_class="write_en2"/>
+          <output name="out1" num_pins="1" port_class="data_out1"/>
+          <output name="out2" num_pins="1" port_class="data_out2"/>
+          <clock name="clk" num_pins="1" port_class="clock"/>
+          <T_setup value="509e-12" port="mem_32768x1_dp.addr1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_dp.data1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_dp.we1" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_dp.addr2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_dp.data2" clock="clk"/>
+          <T_setup value="509e-12" port="mem_32768x1_dp.we2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_dp.addr1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_dp.data1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_dp.we1" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_dp.addr2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_dp.data2" clock="clk"/>
+          <T_hold value="238e-12" port="mem_32768x1_dp.we2" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_32768x1_dp.out1" clock="clk"/>
+          <T_clock_to_Q max="1.234e-9" min="1.196e-9" port="mem_32768x1_dp.out2" clock="clk"/>
+          <power method="pin-toggle">
+            <port name="clk" energy_per_toggle="17.9e-12"/>
+            <static_power power_per_instance="0.0"/>
+          </power>
+        </pb_type>
+        <interconnect>
+          <direct name="address1" input="memory.addr1[14:0]" output="mem_32768x1_dp.addr1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr1[14:0]" out_port="mem_32768x1_dp.addr1"/>
+          </direct>
+          <direct name="address2" input="memory.addr2[14:0]" output="mem_32768x1_dp.addr2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.addr2[14:0]" out_port="mem_32768x1_dp.addr2"/>
+          </direct>
+          <direct name="data1" input="memory.data[0:0]" output="mem_32768x1_dp.data1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[0:0]" out_port="mem_32768x1_dp.data1"/>
+          </direct>
+          <direct name="data2" input="memory.data[1:1]" output="mem_32768x1_dp.data2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.data[1:1]" out_port="mem_32768x1_dp.data2"/>
+          </direct>
+          <direct name="writeen1" input="memory.we1" output="mem_32768x1_dp.we1">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we1" out_port="mem_32768x1_dp.we1"/>
+          </direct>
+          <direct name="writeen2" input="memory.we2" output="mem_32768x1_dp.we2">
+            <delay_constant max="132e-12" min="88e-12" in_port="memory.we2" out_port="mem_32768x1_dp.we2"/>
+          </direct>
+          <direct name="dataout1" input="mem_32768x1_dp.out1" output="memory.out[0:0]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_32768x1_dp.out1" out_port="memory.out[0:0]"/>
+          </direct>
+          <direct name="dataout2" input="mem_32768x1_dp.out2" output="memory.out[1:1]">
+            <delay_constant max="50e-12" min="46e-12" in_port="mem_32768x1_dp.out2" out_port="memory.out[1:1]"/>
+          </direct>
+          <direct name="clk" input="memory.clk" output="mem_32768x1_dp.clk">
+          </direct>
+        </interconnect>
+      </mode>
+      <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin is driven by 10% of the tracks in a channel -->
+      <!-- Place this memory block every 8 columns from (and including) the second column -->
+      <power method="sum-of-children"/>
+    </pb_type>
+    <!-- Define fracturable memory end -->
+  </complexblocklist>
+  <power>
+    <local_interconnect C_wire="2.5e-10"/>
+    <mux_transistor_size mux_transistor_size="3"/>
+    <FF_size FF_size="4"/>
+    <LUT_transistor_size LUT_transistor_size="4"/>
+  </power>
+  <clocks>
+    <clock buffer_size="auto" C_wire="2.5e-10"/>
+  </clocks>
+</architecture>
diff --git a/third_party/vtr_flow/primitives.v b/third_party/vtr_flow/primitives.v
new file mode 100644
index 000000000..678af1ccd
--- /dev/null
+++ b/third_party/vtr_flow/primitives.v
@@ -0,0 +1,329 @@
+`timescale 1ps/1ps
+//Overview
+//========
+//This file contains the verilog primitives produced by VPR's
+//post-synthesis netlist writer.
+//
+//If you wish to do back-annotated timing simulation you will need
+//to link with this file during simulation.
+//
+//To ensure correct results when performing back-annotation with
+//Modelsim see the notes at the end of this comment.
+//
+//Specifying Timing Edges
+//=======================
+//To perform timing back-annotation the simulator must know the delay 
+//dependencies (timing edges) between the ports on each primitive.
+//
+//During back-annotation the simulator will attempt to annotate SDF delay
+//values onto the timing edges.  It should give a warning if it was unable
+//to find a matching edge.
+//
+//
+//In Verilog timing edges are specified using a specify block (delimited by the
+//'specify' and 'endspecify' keywords).
+//
+//Inside the specify block a set of specify statements are used to describe
+//the timing edges.  For example consider:
+//
+//  input [1:0] in;
+//  output [1:0] out;
+//  specify
+//      (in[0] => out[0]) = "";
+//      (in[1] => out[1]) = "";
+//  endspecify
+//
+//This states that there are the following timing edges (dependencies):
+//  * from in[0] to out[0]
+//  * from in[1] to out[1]
+//
+//We could (according to the Verilog standard) equivalently have used:
+//
+//  input [1:0] in;
+//  output [1:0] out;
+//  specify
+//      (in => out) = "";
+//  endspecify
+//
+//However NOT ALL SIMULATORS TREAT MULTIBIT SPECIFY STATEMENTS CORRECTLY,
+//at least by default (in particular ModelSim, see notes below).
+//
+//The previous examples use the 'parallel connection' operator '=>', which
+//creates parallel edges between the two operands (i.e. bit 0 to bit 0, bit
+//1 to bit 1 etc.).  Note that both operands must have the same bit-width. 
+//
+//Verilog also supports the 'full connection' operator '*>' which will create
+//a fully connected set of edges (e.g. from all-to-all). It does not require
+//both operands to have the same bit-width. For example:
+//
+//  input [1:0] in;
+//  output [2:0] out;
+//  specify
+//      (in *> out) = "";
+//  endspecify
+//
+//states that there are the following timing edges (dependencies):
+//  * from in[0] to out[0]
+//  * from in[0] to out[1]
+//  * from in[0] to out[2]
+//  * from in[1] to out[0]
+//  * from in[1] to out[1]
+//  * from in[1] to out[2]
+//
+//For more details on specify blocks see Section 14 "Specify Blocks" of the
+//Verilog standard (IEEE 1364-2005).
+//
+//Back-annotation with Modelsim
+//=============================
+//
+//Ensuring Multi-bit Specifies are Handled Correctly: Bit-blasting
+//----------------------------------------------------------------
+//
+//ModelSim (tested on Modelsim SE 10.4c) ignores multi-bit specify statements
+//by default.
+//
+//This causes SDF annotation errors such as:
+//
+//  vsim-SDF-3261: Failed to find matching specify module path
+//
+//To force Modelsim to correctly interpret multi-bit specify statements you
+//should provide the '+bitblast' option to the vsim executable.
+//This forces it to apply specify statements using multi-bit operands to
+//each bit of the operand (i.e. according to the Verilog standard).
+//
+//Confirming back-annotation is occurring correctly
+//-------------------------------------------------
+//
+//Another useful option is '+sdf_verbose' which produces extra output about
+//SDF annotation, which can be used to verify annotation occurred correctly.
+//
+//For example:
+//
+//      Summary of Verilog design objects annotated: 
+//      
+//           Module path delays =          5
+//      
+//       ******************************************************************************
+//      
+//       Summary of constructs read: 
+//      
+//                 IOPATH =          5
+//
+//shows that all 5 IOPATH constructs in the SDF were annotated to the verilog
+//design.
+//
+//Example vsim Command Line
+//--------------------------
+//The following is an example command-line to vsim (where 'tb' is the name of your
+//testbench):
+//
+//  vsim -t 1ps -L rtl_work -L work -voptargs="+acc" +sdf_verbose +bitblast tb
+
+
+
+
+//K-input Look-Up Table: 'out' is the LUT_MASK bit indexed by the K-bit input 'in'
+module LUT_K #(
+    //The Look-up Table size (number of inputs)
+    parameter K = 1, 
+
+    //The lut mask.  
+    //Left-most (MSB) bit corresponds to all inputs logic one. 
+    //Defaults to always false.
+    parameter LUT_MASK={2**K{1'b0}} 
+) (
+    input [K-1:0] in,
+    output out
+);
+
+    specify
+        (in *> out) = ""; //Full connection: one timing edge from each bit of 'in' to 'out'
+    endspecify
+
+    assign out = LUT_MASK[in]; //'in' is the bit index into LUT_MASK
+
+endmodule
+
+//D-FlipFlop module: Q starts at INITIAL_VALUE and captures D on each rising edge of clk
+module DFF #(
+    parameter INITIAL_VALUE=1'b0    
+) (
+    input clk,
+    input D,
+    output reg Q
+);
+
+    specify
+        (clk => Q) = ""; //Clock-to-Q timing edge
+        $setup(D, posedge clk, ""); //Setup check: D relative to the rising edge of clk
+        $hold(posedge clk, D, ""); //Hold check: D relative to the rising edge of clk
+    endspecify
+
+    initial begin
+        Q <= INITIAL_VALUE; //Value of Q before the first rising clock edge
+    end
+
+    always@(posedge clk) begin
+        Q <= D;
+    end
+endmodule
+
+//Routing fpga_interconnect module: single-bit pass-through from datain to dataout
+module fpga_interconnect(
+    input datain,
+    output dataout
+);
+
+    specify
+        (datain=>dataout)=""; //The only timing edge: datain to dataout
+    endspecify
+
+    assign dataout = datain;
+
+endmodule
+
+
+//2-to-1 mux module: z = x when select is 0, z = y when select is 1
+module mux(
+    input select,
+    input x,
+    input y,
+    output z
+);
+
+    assign z = (x & ~select) | (y & select); //AND-OR form; no specify block is declared for this primitive
+
+endmodule
+
+//n-bit adder: {cout, sumout} = a + b + cin for WIDTH-bit operands
+module adder #(
+    parameter WIDTH = 1   
+) (
+    input [WIDTH-1:0] a, 
+    input [WIDTH-1:0] b, 
+    input cin, 
+    output cout, 
+    output [WIDTH-1:0] sumout);
+
+   specify
+      (a*>sumout)=""; //Full connections: every operand bit has an edge to every sum bit
+      (b*>sumout)="";
+      (cin*>sumout)="";
+      (a*>cout)=""; //Every operand bit also has an edge to the carry out
+      (b*>cout)="";
+      (cin=>cout)=""; //Parallel connection: cin and cout are both single-bit
+   endspecify
+   
+   assign {cout, sumout} = a + b + cin; //cout takes the bit above the WIDTH-bit sum
+   
+endmodule
+   
+//nxn multiplier module: out = a * b, with out twice as wide as each operand
+module multiply #(
+    //The width of input signals
+    parameter WIDTH = 1
+) (
+    input [WIDTH-1:0] a,
+    input [WIDTH-1:0] b,
+    output [2*WIDTH-1:0] out
+);
+
+    specify
+        (a *> out) = ""; //Full connections: every operand bit has an edge to every product bit
+        (b *> out) = "";
+    endspecify
+
+    assign out = a * b;
+
+endmodule // multiply
+
+//single_port_ram module: synchronous RAM with 2**ADDR_WIDTH words of DATA_WIDTH bits
+(* keep_hierarchy *)
+module single_port_ram #(
+    parameter ADDR_WIDTH = 1,
+    parameter DATA_WIDTH = 1
+) (
+    input clk,
+    input [ADDR_WIDTH-1:0] addr,
+    input [DATA_WIDTH-1:0] data,
+    input we,
+    output reg [DATA_WIDTH-1:0] out
+);
+
+    localparam MEM_DEPTH = 2 ** ADDR_WIDTH;
+
+    reg [DATA_WIDTH-1:0] Mem[MEM_DEPTH-1:0];
+
+    specify
+        (clk*>out)=""; //Clock-to-output edges (clk to every bit of out)
+        $setup(addr, posedge clk, ""); //Setup checks on every input against the rising edge of clk
+        $setup(data, posedge clk, "");
+        $setup(we, posedge clk, "");
+        $hold(posedge clk, addr, ""); //Hold checks on every input against the rising edge of clk
+        $hold(posedge clk, data, "");
+        $hold(posedge clk, we, "");
+    endspecify
+   
+    always@(posedge clk) begin
+        if(we) begin
+            Mem[addr] = data; //Blocking write completes before the read below
+        end
+    	out = Mem[addr]; //New data read-during write behaviour (blocking assignments)
+    end
+   
+endmodule // single_port_ram
+
+//dual_port_ram module: synchronous RAM with two read/write ports sharing one clock and one memory array
+(* keep_hierarchy *)
+module dual_port_ram #(
+    parameter ADDR_WIDTH = 1,
+    parameter DATA_WIDTH = 1
+) (
+    input clk,
+
+    input [ADDR_WIDTH-1:0] addr1,
+    input [ADDR_WIDTH-1:0] addr2,
+    input [DATA_WIDTH-1:0] data1,
+    input [DATA_WIDTH-1:0] data2,
+    input we1,
+    input we2,
+    output reg [DATA_WIDTH-1:0] out1,
+    output reg [DATA_WIDTH-1:0] out2
+);
+
+    localparam MEM_DEPTH = 2 ** ADDR_WIDTH;
+
+    reg [DATA_WIDTH-1:0] Mem[MEM_DEPTH-1:0];
+
+    specify
+        (clk*>out1)=""; //Clock-to-output edges for both read ports
+        (clk*>out2)="";
+        $setup(addr1, posedge clk, ""); //Setup checks on every input against the rising edge of clk
+        $setup(addr2, posedge clk, "");
+        $setup(data1, posedge clk, "");
+        $setup(data2, posedge clk, "");
+        $setup(we1, posedge clk, "");
+        $setup(we2, posedge clk, "");
+        $hold(posedge clk, addr1, ""); //Hold checks on every input against the rising edge of clk
+        $hold(posedge clk, addr2, "");
+        $hold(posedge clk, data1, "");
+        $hold(posedge clk, data2, "");
+        $hold(posedge clk, we1, "");
+        $hold(posedge clk, we2, "");
+    endspecify
+   
+    always@(posedge clk) begin //Port 1
+        if(we1) begin
+            Mem[addr1] = data1; //Blocking write completes before the read below
+        end
+        out1 = Mem[addr1]; //New data read-during write behaviour (blocking assignments)
+    end
+    //NOTE(review): both ports use blocking accesses to Mem on the same edge; when they touch the same address the result depends on which always block the simulator runs first
+    always@(posedge clk) begin //Port 2
+        if(we2) begin
+            Mem[addr2] = data2; //Blocking write completes before the read below
+        end
+        out2 = Mem[addr2]; //New data read-during write behaviour (blocking assignments)
+    end
+   
+endmodule // dual_port_ram
diff --git a/third_party/vtr_flow/verilog/eltwise_layer.v b/third_party/vtr_flow/verilog/eltwise_layer.v
new file mode 100644
index 000000000..11199fb90
--- /dev/null
+++ b/third_party/vtr_flow/verilog/eltwise_layer.v
@@ -0,0 +1,3057 @@
+//////////////////////////////////////////////////////////////////////////////
+// Author: Aman Arora
+//////////////////////////////////////////////////////////////////////////////
+
+`timescale 1ns/1ns
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+// Eltwise layer
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+// Precision BF16
+//Each PE has 1 multiplier, an adder and a subtractor.
+//There are 4 PEs in each compute unit. 
+//There are 6 such compute units in the whole layer.
+//So, total compute throughput is 24 ops per cycle.
+//The "per cycle" is because the adder/sub/mul are
+//pipelined. They may take more than 1 cycle each,
+//but in the steady state, one result will come out every cycle.
+//
+//There are 6 BRAMs for each input operand. Each location in a BRAM
+//stores 4 inputs. So, the read bandwidth is 24 elements
+//per cycle. This matches the compute throughput. So, we
+//utilize each PE every cycle. There are 6 BRAMs for output.
+//We can write 4 elements per cycle.
+//
+//There are two modes of operation: 
+// 1. Vector/Matrix mode
+//    In this mode, both operands are matrices/vectors.
+//    They are read from BRAMs (A and B). The operation 
+//    selected (using the op input) is performed. This mode
+//    can be used for operations such as residual add, or 
+//    dropout.
+// 2. Scalar mode
+//    In this mode, one operand is a matrix/vector and the
+//    other operand is a scalar. It could be the mean or 
+//    variance of a normalization layer for example. The 
+//    scalar input is provided from the top-level of the design
+//    so it can be easily modified at runtime.
+//
+//Important inputs:
+//   mode: 
+//      0 -> Both operands (A and B) are matrices/vectors. Result is a matrix/vector.
+//      1 -> Operand A is matrix/vector. Operand B is scalar. Result is a matrix/vector.
+//   op:
+//      00 -> Addition
+//      01 -> Subtraction
+//      10 -> Multiplication
+//
+//The whole design can operate on 24xN matrices.  
+//Typically, to use this design, we'd break a large input
+//matrix into 24 column sections and process the matrix 
+//section by section. The number of rows will be programmed
+//in the "iterations" register in the design.
+
+
+`define BFLOAT16 
+
+// IEEE Half Precision => EXPONENT = 5, MANTISSA = 10
+// BFLOAT16 => EXPONENT = 8, MANTISSA = 7 
+
+`ifdef BFLOAT16
+`define EXPONENT 8
+`define MANTISSA 7
+`else // for ieee half precision fp16
+`define EXPONENT 5
+`define MANTISSA 10
+`endif
+
+`define SIGN 1
+`define DWIDTH (`SIGN+`EXPONENT+`MANTISSA)
+
+`define AWIDTH 10
+`define MEM_SIZE 1024
+`define DESIGN_SIZE 12
+`define CU_SIZE 4
+`define MASK_WIDTH 4
+`define MEM_ACCESS_LATENCY 1
+
+`define REG_DATAWIDTH 32
+`define REG_ADDRWIDTH 8
+`define ITERATIONS_WIDTH 32
+
+`define REG_STDN_ADDR 32'h4
+`define REG_MATRIX_A_ADDR 32'he
+`define REG_MATRIX_B_ADDR 32'h12
+`define REG_MATRIX_C_ADDR 32'h16
+`define REG_VALID_MASK_A_ADDR 32'h20
+`define REG_VALID_MASK_B_ADDR 32'h5c
+
+`define REG_ITERATIONS_ADDR 32'h40
+
+//This is the pipeline depth of the PEs (adder/mult)
+`define PE_PIPELINE_DEPTH 5
+
+module eltwise_layer(
+  input clk,
+  input clk_mem,
+  input resetn,
+  input pe_resetn,
+  input        [`REG_ADDRWIDTH-1:0] PADDR,
+  input                             PWRITE,
+  input                             PSEL,
+  input                             PENABLE,
+  input        [`REG_DATAWIDTH-1:0] PWDATA,
+  output reg   [`REG_DATAWIDTH-1:0] PRDATA,
+  output reg                        PREADY,
+  input [`DWIDTH-1:0] scalar_inp,
+  input mode, // mode==0 -> vector/matrix, mode==1 -> scalar
+  input  [1:0] op, //op==11 -> Mul, op==01 -> Sub, op==00 -> Add
+  input  [7:0] bram_select,
+  input  [`AWIDTH-1:0] bram_addr_ext,
+  output reg [`CU_SIZE*`DWIDTH-1:0] bram_rdata_ext,
+  input  [`CU_SIZE*`DWIDTH-1:0] bram_wdata_ext,
+  input  [`CU_SIZE-1:0] bram_we_ext
+);
+
+
+  wire PCLK;
+  assign PCLK = clk;
+  wire PRESETn;
+  assign PRESETn = resetn;
+  reg start_reg;
+  reg clear_done_reg;
+
+  //Dummy register to sync all other invalid/unimplemented addresses
+  reg [`REG_DATAWIDTH-1:0] reg_dummy;
+  
+  reg [`AWIDTH-1:0] bram_addr_a_0_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_0_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_0_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_a_0_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_a_2_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_2_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_2_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_a_2_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_a_4_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_4_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_4_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_a_4_ext;
+
+  reg [`AWIDTH-1:0] bram_addr_a_1_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_1_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_1_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_a_1_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_a_3_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_3_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_3_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_a_3_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_a_5_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_5_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_5_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_a_5_ext;
+
+    
+  reg [`AWIDTH-1:0] bram_addr_b_0_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_0_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_0_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_b_0_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_b_1_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_1_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_1_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_b_1_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_b_2_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_2_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_2_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_b_2_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_b_3_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_3_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_3_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_b_3_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_b_4_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_4_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_4_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_b_4_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_b_5_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_5_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_5_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_b_5_ext;
+
+  reg [`AWIDTH-1:0] bram_addr_c_0_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_0_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_0_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_c_0_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_c_1_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_1_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_1_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_c_1_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_c_2_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_2_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_2_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_c_2_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_c_3_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_3_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_3_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_c_3_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_c_4_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_4_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_4_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_c_4_ext;
+    
+  reg [`AWIDTH-1:0] bram_addr_c_5_ext;
+  wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_5_ext;
+  reg [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_5_ext;
+  reg [`MASK_WIDTH-1:0] bram_we_c_5_ext;
+    
+	wire [`AWIDTH-1:0] bram_addr_a_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_0;
+	wire [`MASK_WIDTH-1:0] bram_we_a_0;
+	wire bram_en_a_0;
+    
+	wire [`AWIDTH-1:0] bram_addr_a_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_2;
+	wire [`MASK_WIDTH-1:0] bram_we_a_2;
+	wire bram_en_a_2;
+    
+	wire [`AWIDTH-1:0] bram_addr_a_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_4;
+	wire [`MASK_WIDTH-1:0] bram_we_a_4;
+	wire bram_en_a_4;
+
+	wire [`AWIDTH-1:0] bram_addr_a_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_1;
+	wire [`MASK_WIDTH-1:0] bram_we_a_1;
+	wire bram_en_a_1;
+    
+	wire [`AWIDTH-1:0] bram_addr_a_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_3;
+	wire [`MASK_WIDTH-1:0] bram_we_a_3;
+	wire bram_en_a_3;
+    
+	wire [`AWIDTH-1:0] bram_addr_a_5;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_a_5;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_a_5;
+	wire [`MASK_WIDTH-1:0] bram_we_a_5;
+	wire bram_en_a_5;
+    
+	wire [`AWIDTH-1:0] bram_addr_b_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_0;
+	wire [`MASK_WIDTH-1:0] bram_we_b_0;
+	wire bram_en_b_0;
+    
+	wire [`AWIDTH-1:0] bram_addr_b_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_1;
+	wire [`MASK_WIDTH-1:0] bram_we_b_1;
+	wire bram_en_b_1;
+    
+	wire [`AWIDTH-1:0] bram_addr_b_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_2;
+	wire [`MASK_WIDTH-1:0] bram_we_b_2;
+	wire bram_en_b_2;
+    
+	wire [`AWIDTH-1:0] bram_addr_b_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_3;
+	wire [`MASK_WIDTH-1:0] bram_we_b_3;
+	wire bram_en_b_3;
+
+  wire [`AWIDTH-1:0] bram_addr_b_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_4;
+	wire [`MASK_WIDTH-1:0] bram_we_b_4;
+	wire bram_en_b_4;
+    
+	wire [`AWIDTH-1:0] bram_addr_b_5;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_b_5;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_b_5;
+	wire [`MASK_WIDTH-1:0] bram_we_b_5;
+	wire bram_en_b_5;
+
+	wire [`AWIDTH-1:0] bram_addr_c_0;
+	wire [`AWIDTH-1:0] bram_addr_c_1;
+	wire [`AWIDTH-1:0] bram_addr_c_2;
+	wire [`AWIDTH-1:0] bram_addr_c_3;
+	wire [`AWIDTH-1:0] bram_addr_c_4;
+	wire [`AWIDTH-1:0] bram_addr_c_5;
+
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_wdata_c_5;
+
+	wire [`MASK_WIDTH-1:0] bram_we_c_0;
+	wire [`MASK_WIDTH-1:0] bram_we_c_1;
+	wire [`MASK_WIDTH-1:0] bram_we_c_2;
+	wire [`MASK_WIDTH-1:0] bram_we_c_3;
+	wire [`MASK_WIDTH-1:0] bram_we_c_4;
+	wire [`MASK_WIDTH-1:0] bram_we_c_5;
+    
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] bram_rdata_c_5;
+
+  always @ (posedge clk) begin // Registered demux of the external BRAM port onto the 18 banks
+    case (bram_select)
+      // 0-5 -> A banks (in the order 0,2,4,1,3,5), 6-11 -> B banks 0-5, 12-17 -> C banks 0-5
+      0: begin
+      bram_addr_a_0_ext <= bram_addr_ext;
+      bram_wdata_a_0_ext <= bram_wdata_ext;
+      bram_we_a_0_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_a_0_ext;
+      end
+      // NOTE(review): banks that are not selected keep their last registered addr/wdata/we values
+      1: begin
+      bram_addr_a_2_ext <= bram_addr_ext;
+      bram_wdata_a_2_ext <= bram_wdata_ext;
+      bram_we_a_2_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_a_2_ext;
+      end
+    
+      2: begin
+      bram_addr_a_4_ext <= bram_addr_ext;
+      bram_wdata_a_4_ext <= bram_wdata_ext;
+      bram_we_a_4_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_a_4_ext;
+      end
+
+      3: begin
+      bram_addr_a_1_ext <= bram_addr_ext;
+      bram_wdata_a_1_ext <= bram_wdata_ext;
+      bram_we_a_1_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_a_1_ext;
+      end
+    
+      4: begin
+      bram_addr_a_3_ext <= bram_addr_ext;
+      bram_wdata_a_3_ext <= bram_wdata_ext;
+      bram_we_a_3_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_a_3_ext;
+      end
+    
+      5: begin
+      bram_addr_a_5_ext <= bram_addr_ext;
+      bram_wdata_a_5_ext <= bram_wdata_ext;
+      bram_we_a_5_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_a_5_ext;
+      end
+      // Every arm uses nonblocking assignments: all targets are registers clocked by clk
+      6: begin
+      bram_addr_b_0_ext <= bram_addr_ext;
+      bram_wdata_b_0_ext <= bram_wdata_ext;
+      bram_we_b_0_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_b_0_ext;
+      end
+    
+      7: begin
+      bram_addr_b_1_ext <= bram_addr_ext;
+      bram_wdata_b_1_ext <= bram_wdata_ext;
+      bram_we_b_1_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_b_1_ext;
+      end
+    
+      8: begin
+      bram_addr_b_2_ext <= bram_addr_ext;
+      bram_wdata_b_2_ext <= bram_wdata_ext;
+      bram_we_b_2_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_b_2_ext;
+      end
+    
+      9: begin
+      bram_addr_b_3_ext <= bram_addr_ext;
+      bram_wdata_b_3_ext <= bram_wdata_ext;
+      bram_we_b_3_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_b_3_ext;
+      end
+    
+      10: begin
+      bram_addr_b_4_ext <= bram_addr_ext;
+      bram_wdata_b_4_ext <= bram_wdata_ext;
+      bram_we_b_4_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_b_4_ext;
+      end
+    
+      11: begin
+      bram_addr_b_5_ext <= bram_addr_ext;
+      bram_wdata_b_5_ext <= bram_wdata_ext;
+      bram_we_b_5_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_b_5_ext;
+      end
+
+      12: begin
+      bram_addr_c_0_ext <= bram_addr_ext;
+      bram_wdata_c_0_ext <= bram_wdata_ext;
+      bram_we_c_0_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_c_0_ext;
+      end
+    
+      13: begin
+      bram_addr_c_1_ext <= bram_addr_ext;
+      bram_wdata_c_1_ext <= bram_wdata_ext;
+      bram_we_c_1_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_c_1_ext;
+      end
+    
+      14: begin
+      bram_addr_c_2_ext <= bram_addr_ext;
+      bram_wdata_c_2_ext <= bram_wdata_ext;
+      bram_we_c_2_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_c_2_ext;
+      end
+    
+      15: begin
+      bram_addr_c_3_ext <= bram_addr_ext;
+      bram_wdata_c_3_ext <= bram_wdata_ext;
+      bram_we_c_3_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_c_3_ext;
+      end
+    
+      16: begin
+      bram_addr_c_4_ext <= bram_addr_ext;
+      bram_wdata_c_4_ext <= bram_wdata_ext;
+      bram_we_c_4_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_c_4_ext;
+      end
+    
+      17: begin
+      bram_addr_c_5_ext <= bram_addr_ext;
+      bram_wdata_c_5_ext <= bram_wdata_ext;
+      bram_we_c_5_ext <= bram_we_ext;
+      bram_rdata_ext <= bram_rdata_c_5_ext;
+      end
+      // NOTE(review): any other select value still forwards the external request (including bram_we_ext) to C bank 5 and reads back 0 -- confirm this is intended
+      default: begin
+      bram_addr_c_5_ext <= bram_addr_ext;
+      bram_wdata_c_5_ext <= bram_wdata_ext;
+      bram_we_c_5_ext <= bram_we_ext;
+      bram_rdata_ext <= 0;
+      end
+    endcase 
+  end
+  
+/////////////////////////////////////////////////
+// BRAMs to store matrix A
+/////////////////////////////////////////////////
+
+
+  // BRAM matrix A 0
+ram matrix_A_0(
+  .addr0(bram_addr_a_0),
+  .d0(bram_wdata_a_0), 
+  .we0(bram_we_a_0), 
+  .q0(bram_rdata_a_0), 
+  .addr1(bram_addr_a_0_ext),
+  .d1(bram_wdata_a_0_ext), 
+  .we1(bram_we_a_0_ext), 
+  .q1(bram_rdata_a_0_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix A 2
+ram matrix_A_2(
+  .addr0(bram_addr_a_2),
+  .d0(bram_wdata_a_2), 
+  .we0(bram_we_a_2), 
+  .q0(bram_rdata_a_2), 
+  .addr1(bram_addr_a_2_ext),
+  .d1(bram_wdata_a_2_ext), 
+  .we1(bram_we_a_2_ext), 
+  .q1(bram_rdata_a_2_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix A 4
+ram matrix_A_4(
+  .addr0(bram_addr_a_4),
+  .d0(bram_wdata_a_4), 
+  .we0(bram_we_a_4), 
+  .q0(bram_rdata_a_4), 
+  .addr1(bram_addr_a_4_ext),
+  .d1(bram_wdata_a_4_ext), 
+  .we1(bram_we_a_4_ext), 
+  .q1(bram_rdata_a_4_ext), 
+  .clk(clk_mem));
+
+
+    // BRAM matrix A 1
+ram matrix_A_1(
+  .addr0(bram_addr_a_1),
+  .d0(bram_wdata_a_1), 
+  .we0(bram_we_a_1), 
+  .q0(bram_rdata_a_1), 
+  .addr1(bram_addr_a_1_ext),
+  .d1(bram_wdata_a_1_ext), 
+  .we1(bram_we_a_1_ext), 
+  .q1(bram_rdata_a_1_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix A 3
+ram matrix_A_3(
+  .addr0(bram_addr_a_3),
+  .d0(bram_wdata_a_3), 
+  .we0(bram_we_a_3), 
+  .q0(bram_rdata_a_3), 
+  .addr1(bram_addr_a_3_ext),
+  .d1(bram_wdata_a_3_ext), 
+  .we1(bram_we_a_3_ext), 
+  .q1(bram_rdata_a_3_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix A 5
+ram matrix_A_5(
+  .addr0(bram_addr_a_5),
+  .d0(bram_wdata_a_5), 
+  .we0(bram_we_a_5), 
+  .q0(bram_rdata_a_5), 
+  .addr1(bram_addr_a_5_ext),
+  .d1(bram_wdata_a_5_ext), 
+  .we1(bram_we_a_5_ext), 
+  .q1(bram_rdata_a_5_ext), 
+  .clk(clk_mem));
+
+////////////////////////////////////////////////
+// BRAMs to store matrix B
+/////////////////////////////////////////////////
+
+
+  // BRAM matrix B 0
+ram matrix_B_0(
+  .addr0(bram_addr_b_0),
+  .d0(bram_wdata_b_0), 
+  .we0(bram_we_b_0), 
+  .q0(bram_rdata_b_0), 
+  .addr1(bram_addr_b_0_ext),
+  .d1(bram_wdata_b_0_ext), 
+  .we1(bram_we_b_0_ext), 
+  .q1(bram_rdata_b_0_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix B 1
+ram matrix_B_1(
+  .addr0(bram_addr_b_1),
+  .d0(bram_wdata_b_1), 
+  .we0(bram_we_b_1), 
+  .q0(bram_rdata_b_1), 
+  .addr1(bram_addr_b_1_ext),
+  .d1(bram_wdata_b_1_ext), 
+  .we1(bram_we_b_1_ext), 
+  .q1(bram_rdata_b_1_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix B 2
+ram matrix_B_2(
+  .addr0(bram_addr_b_2),
+  .d0(bram_wdata_b_2), 
+  .we0(bram_we_b_2), 
+  .q0(bram_rdata_b_2), 
+  .addr1(bram_addr_b_2_ext),
+  .d1(bram_wdata_b_2_ext), 
+  .we1(bram_we_b_2_ext), 
+  .q1(bram_rdata_b_2_ext), 
+  .clk(clk_mem));
+
+  	
+  // BRAM matrix B 3
+ram matrix_B_3(
+  .addr0(bram_addr_b_3),
+  .d0(bram_wdata_b_3), 
+  .we0(bram_we_b_3), 
+  .q0(bram_rdata_b_3), 
+  .addr1(bram_addr_b_3_ext),
+  .d1(bram_wdata_b_3_ext), 
+  .we1(bram_we_b_3_ext), 
+  .q1(bram_rdata_b_3_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix B 4
+ram matrix_B_4(
+  .addr0(bram_addr_b_4),
+  .d0(bram_wdata_b_4), 
+  .we0(bram_we_b_4), 
+  .q0(bram_rdata_b_4), 
+  .addr1(bram_addr_b_4_ext),
+  .d1(bram_wdata_b_4_ext), 
+  .we1(bram_we_b_4_ext), 
+  .q1(bram_rdata_b_4_ext), 
+  .clk(clk_mem));
+
+
+  // BRAM matrix B 5
+ram matrix_B_5(
+  .addr0(bram_addr_b_5),
+  .d0(bram_wdata_b_5), 
+  .we0(bram_we_b_5), 
+  .q0(bram_rdata_b_5), 
+  .addr1(bram_addr_b_5_ext),
+  .d1(bram_wdata_b_5_ext), 
+  .we1(bram_we_b_5_ext), 
+  .q1(bram_rdata_b_5_ext), 
+  .clk(clk_mem));
+
+////////////////////////////////////////////////
+// BRAMs to store matrix C
+/////////////////////////////////////////////////
+
+
+  // BRAM matrix C 0
+ram matrix_C_0(
+  .addr0(bram_addr_c_0),
+  .d0(bram_wdata_c_0), 
+  .we0(bram_we_c_0), 
+  .q0(bram_rdata_c_0), 
+  .addr1(bram_addr_c_0_ext),
+  .d1(bram_wdata_c_0_ext), 
+  .we1(bram_we_c_0_ext), 
+  .q1(bram_rdata_c_0_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix C 1
+ram matrix_C_1(
+  .addr0(bram_addr_c_1),
+  .d0(bram_wdata_c_1), 
+  .we0(bram_we_c_1), 
+  .q0(bram_rdata_c_1), 
+  .addr1(bram_addr_c_1_ext),
+  .d1(bram_wdata_c_1_ext), 
+  .we1(bram_we_c_1_ext), 
+  .q1(bram_rdata_c_1_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix C 2
+ram matrix_C_2(
+  .addr0(bram_addr_c_2),
+  .d0(bram_wdata_c_2), 
+  .we0(bram_we_c_2), 
+  .q0(bram_rdata_c_2), 
+  .addr1(bram_addr_c_2_ext),
+  .d1(bram_wdata_c_2_ext), 
+  .we1(bram_we_c_2_ext), 
+  .q1(bram_rdata_c_2_ext), 
+  .clk(clk_mem));
+
+  	
+  // BRAM matrix C 3
+ram matrix_C_3(
+  .addr0(bram_addr_c_3),
+  .d0(bram_wdata_c_3), 
+  .we0(bram_we_c_3), 
+  .q0(bram_rdata_c_3), 
+  .addr1(bram_addr_c_3_ext),
+  .d1(bram_wdata_c_3_ext), 
+  .we1(bram_we_c_3_ext), 
+  .q1(bram_rdata_c_3_ext), 
+  .clk(clk_mem));
+  	
+  // BRAM matrix C 4
+ram matrix_C_4(
+  .addr0(bram_addr_c_4),
+  .d0(bram_wdata_c_4), 
+  .we0(bram_we_c_4), 
+  .q0(bram_rdata_c_4), 
+  .addr1(bram_addr_c_4_ext),
+  .d1(bram_wdata_c_4_ext), 
+  .we1(bram_we_c_4_ext), 
+  .q1(bram_rdata_c_4_ext), 
+  .clk(clk_mem));
+
+
+  // BRAM matrix C 5
+ram matrix_C_5(
+  .addr0(bram_addr_c_5),
+  .d0(bram_wdata_c_5), 
+  .we0(bram_we_c_5), 
+  .q0(bram_rdata_c_5), 
+  .addr1(bram_addr_c_5_ext),
+  .d1(bram_wdata_c_5_ext), 
+  .we1(bram_we_c_5_ext), 
+  .q1(bram_rdata_c_5_ext), 
+  .clk(clk_mem));
+  	
+reg start_eltwise_op;
+wire done_eltwise_op;
+
+reg [3:0] state;
+	
+////////////////////////////////////////////////////////////////
+// Control logic
+////////////////////////////////////////////////////////////////
+	always @( posedge clk) begin // Control FSM: sequences start_eltwise_op from start_reg, done_eltwise_op and clear_done_reg
+      if (resetn == 1'b0) begin
+        state <= 4'b0000;
+        start_eltwise_op <= 1'b0;
+      end 
+      else begin
+        case (state)
+        // NOTE(review): there is no default arm, so any encoding other than the four below leaves state unchanged
+        4'b0000: begin // Idle: wait for software to set start_reg
+          start_eltwise_op <= 1'b0;
+          if (start_reg == 1'b1) begin
+            state <= 4'b0001;
+          end else begin
+            state <= 4'b0000;
+          end
+        end
+        
+        4'b0001: begin // Launch: raise start_eltwise_op and move on unconditionally
+          start_eltwise_op <= 1'b1;	      
+          state <= 4'b1010;                    
+        end      
+        
+        4'b1010: begin // Busy: hold start_eltwise_op high until done_eltwise_op is seen
+          if (done_eltwise_op == 1'b1) begin
+            start_eltwise_op <= 1'b0;
+            state <= 4'b1000;
+          end
+          else begin
+            state <= 4'b1010;
+          end
+        end
+
+       4'b1000: begin // Done: wait for clear_done_reg before returning to idle
+         if (clear_done_reg == 1'b1) begin
+           state <= 4'b0000;
+         end
+         else begin
+           state <= 4'b1000;
+         end
+       end
+      endcase  
+	end 
+  end
+
+reg [1:0] state_apb;
+`define IDLE     2'b00
+`define W_ENABLE  2'b01
+`define R_ENABLE  2'b10
+
+reg [`AWIDTH-1:0] address_mat_a;
+reg [`AWIDTH-1:0] address_mat_b;
+reg [`AWIDTH-1:0] address_mat_c;
+reg [`MASK_WIDTH-1:0] validity_mask_a;
+reg [`MASK_WIDTH-1:0] validity_mask_b;
+reg [`ITERATIONS_WIDTH-1:0] iterations;
+
+////////////////////////////////////////////////////////////////
+// Configuration logic
+////////////////////////////////////////////////////////////////
+always @(posedge PCLK) begin // Register interface: IDLE -> W_ENABLE or R_ENABLE -> IDLE, one transfer per pass
+  if (PRESETn == 0) begin
+    state_apb <= `IDLE;
+    PRDATA <= 0;
+    PREADY <= 0;
+    address_mat_a <= 0;
+    address_mat_b <= 0;
+    address_mat_c <= 0;
+    validity_mask_a <= {`MASK_WIDTH{1'b1}};
+    validity_mask_b <= {`MASK_WIDTH{1'b1}};
+  end
+  // NOTE(review): start_reg, clear_done_reg, iterations and reg_dummy are not assigned in the reset branch above
+  else begin
+    case (state_apb)
+      `IDLE : begin
+        PRDATA <= 0;
+        if (PSEL) begin // A selected transfer picks the write or read phase from PWRITE
+          if (PWRITE) begin
+            state_apb <= `W_ENABLE;
+          end
+          else begin
+            state_apb <= `R_ENABLE;
+          end
+        end
+        PREADY <= 0;
+      end
+
+      `W_ENABLE : begin
+        if (PSEL && PWRITE && PENABLE) begin
+          case (PADDR)
+          `REG_STDN_ADDR       : begin // Bit 0 -> start_reg, bit 31 -> clear_done_reg
+                                 start_reg <= PWDATA[0];
+                                 clear_done_reg <= PWDATA[31];
+                                 end
+          `REG_MATRIX_A_ADDR   : address_mat_a <= PWDATA[`AWIDTH-1:0];
+          `REG_MATRIX_B_ADDR   : address_mat_b <= PWDATA[`AWIDTH-1:0];
+          `REG_MATRIX_C_ADDR   : address_mat_c <= PWDATA[`AWIDTH-1:0];
+          `REG_VALID_MASK_A_ADDR: begin
+                                validity_mask_a <= PWDATA[`MASK_WIDTH-1:0];
+                                end
+          `REG_VALID_MASK_B_ADDR: begin
+                                validity_mask_b <= PWDATA[`MASK_WIDTH-1:0];
+                                end
+          `REG_ITERATIONS_ADDR: iterations <= PWDATA[`ITERATIONS_WIDTH-1:0];
+          default : reg_dummy <= PWDATA; //sink writes to a dummy register
+          endcase
+          PREADY <=1;          
+        end
+        state_apb <= `IDLE; // Always return to IDLE, whether or not the write was accepted
+      end
+
+      `R_ENABLE : begin
+        if (PSEL && !PWRITE && PENABLE) begin
+          PREADY <= 1;
+          case (PADDR)
+          `REG_STDN_ADDR        : PRDATA <= {done_eltwise_op, 30'b0, start_eltwise_op}; // Reads back the live done/start signals, not start_reg/clear_done_reg
+          `REG_MATRIX_A_ADDR    : PRDATA <= address_mat_a;
+          `REG_MATRIX_B_ADDR    : PRDATA <= address_mat_b;
+          `REG_MATRIX_C_ADDR    : PRDATA <= address_mat_c;
+          `REG_VALID_MASK_A_ADDR: PRDATA <= validity_mask_a;
+          `REG_VALID_MASK_B_ADDR: PRDATA <= validity_mask_b;
+          `REG_ITERATIONS_ADDR: PRDATA <= iterations;
+          default : PRDATA <= reg_dummy; //read the dummy register for undefined addresses
+          endcase
+        end
+        state_apb <= `IDLE; // Always return to IDLE, whether or not the read was accepted
+      end
+      default: begin
+        state_apb <= `IDLE;
+      end
+    endcase
+  end
+end  
+  
+wire reset;
+assign reset = ~resetn;
+wire pe_reset;
+assign pe_reset = ~pe_resetn;
+
+  wire c_data_0_available;
+  wire c_data_1_available;
+  wire c_data_2_available;
+  wire c_data_3_available;
+  wire c_data_4_available;
+  wire c_data_5_available;
+
+  assign bram_wdata_a_0 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_a_0 = 1'b1;
+  assign bram_we_a_0 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_a_1 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_a_1 = 1'b1;
+  assign bram_we_a_1 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_a_2 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_a_2 = 1'b1;
+  assign bram_we_a_2 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_a_3 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_a_3 = 1'b1;
+  assign bram_we_a_3 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_a_4 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_a_4 = 1'b1;
+  assign bram_we_a_4 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_a_5 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_a_5 = 1'b1;
+  assign bram_we_a_5 = {`MASK_WIDTH{1'b0}};
+  	
+  assign bram_wdata_b_0 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_b_0 = 1'b1;
+  assign bram_we_b_0 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_b_1 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_b_1 = 1'b1;
+  assign bram_we_b_1 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_b_2 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_b_2 = 1'b1;
+  assign bram_we_b_2 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_b_3 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_b_3 = 1'b1;
+  assign bram_we_b_3 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_b_4 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_b_4 = 1'b1;
+  assign bram_we_b_4 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_wdata_b_5 = {`CU_SIZE*`DWIDTH{1'b0}};
+  assign bram_en_b_5 = 1'b1;
+  assign bram_we_b_5 = {`MASK_WIDTH{1'b0}};
+
+  assign bram_we_c_0 = (c_data_0_available) ? {`MASK_WIDTH{1'b1}} : {`MASK_WIDTH{1'b0}};  
+  assign bram_we_c_2 = (c_data_2_available) ? {`MASK_WIDTH{1'b1}} : {`MASK_WIDTH{1'b0}};  
+  assign bram_we_c_4 = (c_data_4_available) ? {`MASK_WIDTH{1'b1}} : {`MASK_WIDTH{1'b0}};  
+  assign bram_we_c_1 = (c_data_1_available) ? {`MASK_WIDTH{1'b1}} : {`MASK_WIDTH{1'b0}};  
+  assign bram_we_c_3 = (c_data_3_available) ? {`MASK_WIDTH{1'b1}} : {`MASK_WIDTH{1'b0}};  
+  assign bram_we_c_5 = (c_data_5_available) ? {`MASK_WIDTH{1'b1}} : {`MASK_WIDTH{1'b0}};  
+
+  /////////////////////////////////////////////////
+  // ORing all done signals
+  /////////////////////////////////////////////////
+  wire done_eltwise_op_0;
+  wire done_eltwise_op_1;
+  wire done_eltwise_op_2;
+  wire done_eltwise_op_3;
+  wire done_eltwise_op_4;
+  wire done_eltwise_op_5;
+
+  assign done_eltwise_op = 
+  done_eltwise_op_0 | 
+  done_eltwise_op_1 | 
+  done_eltwise_op_2 | 
+  done_eltwise_op_3 | 
+  done_eltwise_op_4 | 
+  done_eltwise_op_5 ;
+
+  /////////////////////////////////////////////////
+  // Code to allow for scalar mode
+  /////////////////////////////////////////////////
+  
+	wire [`CU_SIZE*`DWIDTH-1:0] b_data_0;
+	wire [`CU_SIZE*`DWIDTH-1:0] b_data_1;
+	wire [`CU_SIZE*`DWIDTH-1:0] b_data_2;
+	wire [`CU_SIZE*`DWIDTH-1:0] b_data_3;
+	wire [`CU_SIZE*`DWIDTH-1:0] b_data_4;
+	wire [`CU_SIZE*`DWIDTH-1:0] b_data_5;
+
+  assign b_data_0 = mode ? bram_rdata_b_0 : {`CU_SIZE{scalar_inp}};
+  assign b_data_1 = mode ? bram_rdata_b_1 : {`CU_SIZE{scalar_inp}};
+  assign b_data_2 = mode ? bram_rdata_b_2 : {`CU_SIZE{scalar_inp}};
+  assign b_data_3 = mode ? bram_rdata_b_3 : {`CU_SIZE{scalar_inp}};
+  assign b_data_4 = mode ? bram_rdata_b_4 : {`CU_SIZE{scalar_inp}};
+  assign b_data_5 = mode ? bram_rdata_b_5 : {`CU_SIZE{scalar_inp}};
+
+  /////////////////////////////////////////////////
+  // Compute Unit 0
+  /////////////////////////////////////////////////
+
+eltwise_cu u_eltwise_cu_0(
+  .clk(clk),
+  .reset(reset),
+  .pe_reset(pe_reset),
+  .start_eltwise_op(start_eltwise_op),
+  .done_eltwise_op(done_eltwise_op_0),
+  .count(iterations),
+  .op(op),
+  .address_mat_a(address_mat_a),
+  .address_mat_b(address_mat_b),
+  .address_mat_c(address_mat_c),
+  .a_data(bram_rdata_a_0),
+  .b_data(b_data_0),
+  .c_data_out(bram_wdata_c_0),
+  .a_addr(bram_addr_a_0),
+  .b_addr(bram_addr_b_0),
+  .c_addr(bram_addr_c_0),
+  .c_data_available(c_data_0_available),
+  .validity_mask_a(4'b1111),
+  .validity_mask_b(4'b1111)
+);
+
+  /////////////////////////////////////////////////
+  // Compute Unit 1
+  /////////////////////////////////////////////////
+
+eltwise_cu u_eltwise_cu_1(
+  .clk(clk),
+  .reset(reset),
+  .pe_reset(pe_reset),
+  .start_eltwise_op(start_eltwise_op),
+  .done_eltwise_op(done_eltwise_op_1),
+  .count(iterations),
+  .op(op),
+  .address_mat_a(address_mat_a),
+  .address_mat_b(address_mat_b),
+  .address_mat_c(address_mat_c),
+  .a_data(bram_rdata_a_1),
+  .b_data(b_data_1),
+  .c_data_out(bram_wdata_c_1),
+  .a_addr(bram_addr_a_1),
+  .b_addr(bram_addr_b_1),
+  .c_addr(bram_addr_c_1),
+  .c_data_available(c_data_1_available),
+  .validity_mask_a(4'b1111),
+  .validity_mask_b(4'b1111)
+);
+
+  /////////////////////////////////////////////////
+  // Compute Unit 2
+  /////////////////////////////////////////////////
+
+eltwise_cu u_eltwise_cu_2(
+  .clk(clk),
+  .reset(reset),
+  .pe_reset(pe_reset),
+  .start_eltwise_op(start_eltwise_op),
+  .done_eltwise_op(done_eltwise_op_2),
+  .count(iterations),
+  .op(op),
+  .address_mat_a(address_mat_a),
+  .address_mat_b(address_mat_b),
+  .address_mat_c(address_mat_c),
+  .a_data(bram_rdata_a_2),
+  .b_data(b_data_2),
+  .c_data_out(bram_wdata_c_2),
+  .a_addr(bram_addr_a_2),
+  .b_addr(bram_addr_b_2),
+  .c_addr(bram_addr_c_2),
+  .c_data_available(c_data_2_available),
+  .validity_mask_a(4'b1111),
+  .validity_mask_b(4'b1111)
+);
+
+  /////////////////////////////////////////////////
+  // Compute Unit 3
+  /////////////////////////////////////////////////
+
+eltwise_cu u_eltwise_cu_3(
+  .clk(clk),
+  .reset(reset),
+  .pe_reset(pe_reset),
+  .start_eltwise_op(start_eltwise_op),
+  .done_eltwise_op(done_eltwise_op_3),
+  .count(iterations),
+  .op(op),
+  .address_mat_a(address_mat_a),
+  .address_mat_b(address_mat_b),
+  .address_mat_c(address_mat_c),
+  .a_data(bram_rdata_a_3),
+  .b_data(b_data_3),
+  .c_data_out(bram_wdata_c_3),
+  .a_addr(bram_addr_a_3),
+  .b_addr(bram_addr_b_3),
+  .c_addr(bram_addr_c_3),
+  .c_data_available(c_data_3_available),
+  .validity_mask_a(4'b1111),
+  .validity_mask_b(4'b1111)
+);
+
+  /////////////////////////////////////////////////
+  // Compute Unit 4
+  /////////////////////////////////////////////////
+
+eltwise_cu u_eltwise_cu_4(
+  .clk(clk),
+  .reset(reset),
+  .pe_reset(pe_reset),
+  .start_eltwise_op(start_eltwise_op),
+  .done_eltwise_op(done_eltwise_op_4),
+  .count(iterations),
+  .op(op),
+  .address_mat_a(address_mat_a),
+  .address_mat_b(address_mat_b),
+  .address_mat_c(address_mat_c),
+  .a_data(bram_rdata_a_4),
+  .b_data(b_data_4),
+  .c_data_out(bram_wdata_c_4),
+  .a_addr(bram_addr_a_4),
+  .b_addr(bram_addr_b_4),
+  .c_addr(bram_addr_c_4),
+  .c_data_available(c_data_4_available),
+  .validity_mask_a(4'b1111),
+  .validity_mask_b(4'b1111)
+);
+
+  /////////////////////////////////////////////////
+  // Compute Unit 5
+  /////////////////////////////////////////////////
+
+eltwise_cu u_eltwise_cu_5(
+  .clk(clk),
+  .reset(reset),
+  .pe_reset(pe_reset),
+  .start_eltwise_op(start_eltwise_op),
+  .done_eltwise_op(done_eltwise_op_5),
+  .count(iterations),
+  .op(op),
+  .address_mat_a(address_mat_a),
+  .address_mat_b(address_mat_b),
+  .address_mat_c(address_mat_c),
+  .a_data(bram_rdata_a_5),
+  .b_data(b_data_5),
+  .c_data_out(bram_wdata_c_5),
+  .a_addr(bram_addr_a_5),
+  .b_addr(bram_addr_b_5),
+  .c_addr(bram_addr_c_5),
+  .c_data_available(c_data_5_available),
+  .validity_mask_a(4'b0011),
+  .validity_mask_b(4'b0011)
+);
+
+endmodule
+
+
+//////////////////////////////////
+// Dual port RAM built from CU_SIZE parallel dpram_original lanes.
+// Lane i holds bits [i*DWIDTH +: DWIDTH] of d0/d1 and drives the same
+// slice of q0/q1; addr0/addr1 and clk are shared by every lane.
+//////////////////////////////////
+module ram (
+        addr0, 
+        d0, 
+        we0, 
+        q0,  
+        addr1,
+        d1,
+        we1,
+        q1,
+        clk);
+
+input [`AWIDTH-1:0] addr0;           // port 0 word address (same for all lanes)
+input [`AWIDTH-1:0] addr1;           // port 1 word address (same for all lanes)
+input [`CU_SIZE*`DWIDTH-1:0] d0;     // port 0 write data, one DWIDTH slice per lane
+input [`CU_SIZE*`DWIDTH-1:0] d1;     // port 1 write data, one DWIDTH slice per lane
+input [`CU_SIZE-1:0] we0;            // port 0 write enables, bit i gates lane i
+input [`CU_SIZE-1:0] we1;            // port 1 write enables, bit i gates lane i
+output [`CU_SIZE*`DWIDTH-1:0] q0;    // port 0 read data, one DWIDTH slice per lane
+output [`CU_SIZE*`DWIDTH-1:0] q1;    // port 1 read data, one DWIDTH slice per lane
+input clk;
+
+genvar i; 
+// The generate loop body is only given a block name (gen_dpram) when QUARTUS is defined.
+generate
+`ifdef QUARTUS
+   for (i=0;i<`CU_SIZE;i=i+1) begin: gen_dpram
+`else
+   for (i=0;i<`CU_SIZE;i=i+1) begin
+`endif
+     dpram_original #(.AWIDTH(`AWIDTH),.DWIDTH(`DWIDTH),.NUM_WORDS(1<<`AWIDTH)) dp1 (.clk(clk),.address_a(addr0),.address_b(addr1),.wren_a(we0[i]),.wren_b(we1[i]),.data_a(d0[i*`DWIDTH +: `DWIDTH]),.data_b(d1[i*`DWIDTH +: `DWIDTH]),.out_a(q0[i*`DWIDTH +: `DWIDTH]),.out_b(q1[i*`DWIDTH +: `DWIDTH]));
+   end
+endgenerate
+
+endmodule
+
+module dpram_original ( // one memory with two address/data/write-enable ports on a single clock
+    clk,
+    address_a,
+    address_b,
+    wren_a,
+    wren_b,
+    data_a,
+    data_b,
+    out_a,
+    out_b
+);
+parameter AWIDTH=10;      // address width in bits
+parameter NUM_WORDS=1024; // depth of the behavioural array; not referenced in the hard_mem branch
+parameter DWIDTH=32;      // data width in bits
+input clk;
+input [(AWIDTH-1):0] address_a;  // port A word address
+input [(AWIDTH-1):0] address_b;  // port B word address
+input  wren_a;                   // port A write enable
+input  wren_b;                   // port B write enable
+input [(DWIDTH-1):0] data_a;     // port A write data
+input [(DWIDTH-1):0] data_b;     // port B write data
+output reg [(DWIDTH-1):0] out_a; // port A read data
+output reg [(DWIDTH-1):0] out_b; // port B read data
+
+`ifndef hard_mem
+// Behavioural model: each port writes and reads on the rising clock edge.
+reg [DWIDTH-1:0] ram[NUM_WORDS-1:0];
+always @ (posedge clk) begin 
+  if (wren_a) begin
+      ram[address_a] <= data_a;
+  end
+  out_a <= ram[address_a]; // non-blocking: samples the word as it was before this edge's write
+end
+  
+always @ (posedge clk) begin // NOTE(review): no arbitration if both ports write the same address in one cycle
+  if (wren_b) begin
+      ram[address_b] <= data_b;
+  end 
+  out_b <= ram[address_b]; // non-blocking: samples the word as it was before this edge's write
+end
+
+`else
+// hard_mem defined: use a dual_port_ram instance (declared outside this module), sized via defparam.
+defparam u_dual_port_ram.ADDR_WIDTH = AWIDTH;
+defparam u_dual_port_ram.DATA_WIDTH = DWIDTH;
+
+dual_port_ram u_dual_port_ram(
+.addr1(address_a),
+.we1(wren_a),
+.data1(data_a),
+.out1(out_a),
+.addr2(address_b),
+.we2(wren_b),
+.data2(data_b),
+.out2(out_b),
+.clk(clk)
+);
+
+`endif
+endmodule
+
+  
+//////////////////////////////////
+// Elementwise compute unit: connects input_logic (operand fetch), pe_array
+// (four PEs) and output_logic (result write-back), and pulses
+// done_eltwise_op when its cycle counter reaches PE_PIPELINE_DEPTH + count.
+//////////////////////////////////
+module eltwise_cu(
+ clk,
+ reset,
+ pe_reset,
+ start_eltwise_op,
+ done_eltwise_op,
+ count,
+ op,
+ address_mat_a,
+ address_mat_b,
+ address_mat_c,
+ a_data,
+ b_data,
+ c_data_out, 
+ a_addr,
+ b_addr,
+ c_addr,
+ c_data_available,
+ validity_mask_a,
+ validity_mask_b
+);
+
+ input clk;
+ input reset;                            // clears the done counter and is forwarded to all three sub-blocks
+ input pe_reset;                         // forwarded to pe_array only
+ input start_eltwise_op;                 // must stay high while the unit runs; low holds everything in its idle state
+ output done_eltwise_op;                 // registered completion flag (see "Logic for done")
+ input [`ITERATIONS_WIDTH-1:0] count;    // iteration count used for the done target and by input_logic
+ input [1:0] op;                         // operation select, passed through to pe_array
+ input [`AWIDTH-1:0] address_mat_a;      // start address for operand A reads
+ input [`AWIDTH-1:0] address_mat_b;      // start address for operand B reads
+ input [`AWIDTH-1:0] address_mat_c;      // start address for result writes
+ input [`CU_SIZE*`DWIDTH-1:0] a_data;    // operand A word read from memory
+ input [`CU_SIZE*`DWIDTH-1:0] b_data;    // operand B word read from memory
+ output [`CU_SIZE*`DWIDTH-1:0] c_data_out; // result word from output_logic
+ output [`AWIDTH-1:0] a_addr;            // operand A read address from input_logic
+ output [`AWIDTH-1:0] b_addr;            // operand B read address from input_logic
+ output [`AWIDTH-1:0] c_addr;            // result write address from output_logic
+ output c_data_available;                // result strobe from output_logic
+ input [`MASK_WIDTH-1:0] validity_mask_a; // per-lane enable bits for operand A
+ input [`MASK_WIDTH-1:0] validity_mask_b; // per-lane enable bits for operand B
+
+wire [`DWIDTH-1:0] out0; // PE results, lane 0..3
+wire [`DWIDTH-1:0] out1;
+wire [`DWIDTH-1:0] out2;
+wire [`DWIDTH-1:0] out3;
+
+wire [`DWIDTH-1:0] a0_data; // masked per-lane operands produced by input_logic
+wire [`DWIDTH-1:0] a1_data;
+wire [`DWIDTH-1:0] a2_data;
+wire [`DWIDTH-1:0] a3_data;
+wire [`DWIDTH-1:0] b0_data;
+wire [`DWIDTH-1:0] b1_data;
+wire [`DWIDTH-1:0] b2_data;
+wire [`DWIDTH-1:0] b3_data;
+
+//////////////////////////////////////////////////////////////////////////
+// Logic for done
+//////////////////////////////////////////////////////////////////////////
+wire [7:0] clk_cnt_for_done; // only 8 bits wide: the sum below is truncated above bit 7
+reg [31:0] clk_cnt;          // cycles elapsed since start_eltwise_op went high
+reg done_eltwise_op;
+
+assign clk_cnt_for_done = 
+                  `PE_PIPELINE_DEPTH + //This is dependent on the pipeline depth of the PEs
+                  count //The number of iterations asked for this compute unit
+                  ;
+                          
+always @(posedge clk) begin
+  if (reset || ~start_eltwise_op) begin // idle: hold the counter and done at zero
+    clk_cnt <= 0;
+    done_eltwise_op <= 0;
+  end
+  else if (clk_cnt == clk_cnt_for_done) begin // target reached: raise done
+    done_eltwise_op <= 1;
+    clk_cnt <= clk_cnt + 1;
+  end
+  else if (done_eltwise_op == 0) begin // not at the target and done is low: keep counting
+    clk_cnt <= clk_cnt + 1;
+  end    
+  else begin // done is high but the counter has moved past the target: drop done again
+    done_eltwise_op <= 0;
+    clk_cnt <= clk_cnt + 1;
+  end
+end
+
+//////////////////////////////////////////////////////////////////////////
+// Instantiation of input logic
+//////////////////////////////////////////////////////////////////////////
+input_logic u_input_logic(
+.clk(clk),
+.reset(reset),
+.start_eltwise_op(start_eltwise_op),
+.count(count),
+.a_addr(a_addr),
+.b_addr(b_addr),
+.address_mat_a(address_mat_a),
+.address_mat_b(address_mat_b),
+.a_data(a_data),
+.b_data(b_data),
+.a0_data(a0_data),
+.a1_data(a1_data),
+.a2_data(a2_data),
+.a3_data(a3_data),
+.b0_data(b0_data),
+.b1_data(b1_data),
+.b2_data(b2_data),
+.b3_data(b3_data),
+.validity_mask_a(validity_mask_a),
+.validity_mask_b(validity_mask_b)
+);
+
+//////////////////////////////////////////////////////////////////////////
+// Instantiation of the output logic
+//////////////////////////////////////////////////////////////////////////
+output_logic u_output_logic(
+.clk(clk),
+.reset(reset),
+.start_eltwise_op(start_eltwise_op),
+.done_eltwise_op(done_eltwise_op),
+.address_mat_c(address_mat_c),
+.c_data_out(c_data_out),
+.c_addr(c_addr),
+.c_data_available(c_data_available),
+.out0(out0),
+.out1(out1),
+.out2(out2),
+.out3(out3)
+);
+
+//////////////////////////////////////////////////////////////////////////
+// Instantiations of the actual PEs
+//////////////////////////////////////////////////////////////////////////
+pe_array u_pe_array(
+.reset(reset),
+.clk(clk),
+.pe_reset(pe_reset),
+.op(op),
+.a0(a0_data), 
+.a1(a1_data), 
+.a2(a2_data), 
+.a3(a3_data),
+.b0(b0_data), 
+.b1(b1_data), 
+.b2(b2_data), 
+.b3(b3_data),
+.out0(out0),
+.out1(out1),
+.out2(out2),
+.out3(out3)
+);
+
+endmodule
+
+//////////////////////////////////////////////////////////////////////////
+// Output logic: after a start-up delay, registers the four PE outputs as one word per cycle with a strobe and an incrementing address
+//////////////////////////////////////////////////////////////////////////
+module output_logic(
+clk,
+reset,
+start_eltwise_op,
+done_eltwise_op,
+address_mat_c,
+c_data_out, 
+c_addr,
+c_data_available,
+out0,
+out1,
+out2,
+out3
+);
+
+input clk;
+input reset;
+input start_eltwise_op;                   // low (or reset high) returns the block to its idle state
+input done_eltwise_op;                    // not referenced inside this module
+input [`AWIDTH-1:0] address_mat_c;        // value loaded into c_addr while idle
+output [`CU_SIZE*`DWIDTH-1:0] c_data_out; // registered {out3, out2, out1, out0}
+output [`AWIDTH-1:0] c_addr;              // registered result address
+output c_data_available;                  // registered strobe accompanying c_data_out / c_addr
+input [`DWIDTH-1:0] out0;                 // PE results, lane 0..3
+input [`DWIDTH-1:0] out1;
+input [`DWIDTH-1:0] out2;
+input [`DWIDTH-1:0] out3;
+
+reg c_data_available;
+reg [`CU_SIZE*`DWIDTH-1:0] c_data_out;
+
+//////////////////////////////////////////////////////////////////////////
+// Logic to capture matrix C data from the PEs and send to RAM
+//////////////////////////////////////////////////////////////////////////
+// NOTE(review): c_addr is incremented on the same edge that first sets c_data_available, so the first strobe is seen with address_mat_c+1 - confirm intended.
+reg [`AWIDTH-1:0] c_addr;
+reg [7:0] cnt; // cycles since start_eltwise_op went high; 8 bits, so it wraps after 255
+
+always @(posedge clk) begin
+  if (reset | ~start_eltwise_op) begin // idle: strobe low, address reloaded, counter cleared
+    c_data_available <= 1'b0;
+    c_addr <= address_mat_c;
+    c_data_out <= 0;
+    cnt <= 0;
+  end
+  else if (cnt>`PE_PIPELINE_DEPTH) begin // past the start-up delay: capture one result word per cycle
+    c_data_available <= 1'b1;
+    c_addr <= c_addr+1;
+    c_data_out <= {out3, out2, out1, out0}; // lane 0 occupies the least significant DWIDTH bits
+    cnt <= cnt + 1;
+  end else begin // still waiting: only the counter advances, other registers hold
+    cnt <= cnt + 1;
+  end 
+end
+
+endmodule
+
+//////////////////////////////////////////////////////////////////////////
+// Data setup: generates BRAM A/B read addresses and hands out four masked DWIDTH-bit operand lanes per matrix
+//////////////////////////////////////////////////////////////////////////
+module input_logic(
+clk,
+reset,
+start_eltwise_op,
+count,
+a_addr,
+b_addr,
+address_mat_a,
+address_mat_b,
+a_data,
+b_data,
+a0_data,
+a1_data,
+a2_data,
+a3_data,
+b0_data,
+b1_data,
+b2_data,
+b3_data,
+validity_mask_a,
+validity_mask_b
+);
+// NOTE(review): lane slicing below is hard-coded for four lanes; presumably CU_SIZE is 4 and MASK_WIDTH is at least 4 - confirm.
+input clk;
+input reset;
+input start_eltwise_op;                  // low (or reset high) reloads the addresses and clears the counters
+input [`ITERATIONS_WIDTH-1:0] count;     // addresses advance while iterations <= count
+output [`AWIDTH-1:0] a_addr;             // registered read address for BRAM A
+output [`AWIDTH-1:0] b_addr;             // registered read address for BRAM B
+input [`AWIDTH-1:0] address_mat_a;       // value loaded into a_addr while idle
+input [`AWIDTH-1:0] address_mat_b;       // value loaded into b_addr while idle
+input [`CU_SIZE*`DWIDTH-1:0] a_data;     // word read from BRAM A
+input [`CU_SIZE*`DWIDTH-1:0] b_data;     // word read from BRAM B (or its substitute chosen by the parent)
+output [`DWIDTH-1:0] a0_data;            // masked lane 0..3 of a_data
+output [`DWIDTH-1:0] a1_data;
+output [`DWIDTH-1:0] a2_data;
+output [`DWIDTH-1:0] a3_data;
+output [`DWIDTH-1:0] b0_data;            // masked lane 0..3 of b_data
+output [`DWIDTH-1:0] b1_data;
+output [`DWIDTH-1:0] b2_data;
+output [`DWIDTH-1:0] b3_data;
+input [`MASK_WIDTH-1:0] validity_mask_a; // bit k = 0 forces lane k of A to zero
+input [`MASK_WIDTH-1:0] validity_mask_b; // bit k = 0 forces lane k of B to zero
+
+reg [7:0] iterations; // shared by the A and B address generators; only the A block updates it
+
+wire [`DWIDTH-1:0] a0_data;
+wire [`DWIDTH-1:0] a1_data;
+wire [`DWIDTH-1:0] a2_data;
+wire [`DWIDTH-1:0] a3_data;
+wire [`DWIDTH-1:0] b0_data;
+wire [`DWIDTH-1:0] b1_data;
+wire [`DWIDTH-1:0] b2_data;
+wire [`DWIDTH-1:0] b3_data;
+
+//////////////////////////////////////////////////////////////////////////
+// Logic to generate addresses to BRAM A
+//////////////////////////////////////////////////////////////////////////
+reg [`AWIDTH-1:0] a_addr;
+reg a_mem_access; //flag that tells whether the compute unit is trying to access memory or not
+
+always @(posedge clk) begin
+  //else if (clk_cnt >= a_loc*`CU_SIZE+final_mat_mul_size) begin
+  //Writing the line above to avoid multiplication:
+  if (reset || ~start_eltwise_op) begin
+    a_addr <= address_mat_a;
+    a_mem_access <= 0;
+    iterations <= 0;
+  end
+
+  //else if ((clk_cnt >= a_loc*`CU_SIZE) && (clk_cnt < a_loc*`CU_SIZE+final_mat_mul_size)) begin
+  //Writing the line above to avoid multiplication:
+  else if (iterations <= count) begin // once iterations exceeds count every register here simply holds
+    a_addr <= a_addr + 1;
+    a_mem_access <= 1;
+    iterations <= iterations + 1;
+  end
+end  
+
+//////////////////////////////////////////////////////////////////////////
+// Logic to generate valid signals for data coming from BRAM A
+//////////////////////////////////////////////////////////////////////////
+reg [7:0] a_mem_access_counter; // cycles for which a_mem_access has been high
+always @(posedge clk) begin
+  if (reset || ~start_eltwise_op) begin
+    a_mem_access_counter <= 0;
+  end
+  else if (a_mem_access == 1) begin
+    a_mem_access_counter <= a_mem_access_counter + 1;  
+
+  end
+  else begin
+    a_mem_access_counter <= 0;
+  end
+end
+
+wire bram_rdata_a_valid; //flag that tells whether the data from memory is valid
+assign bram_rdata_a_valid = 
+       ((validity_mask_a[0]==1'b0 && a_mem_access_counter==1) || // forced low when mask bit k is 0 and the counter equals k+1
+        (validity_mask_a[1]==1'b0 && a_mem_access_counter==2) ||
+        (validity_mask_a[2]==1'b0 && a_mem_access_counter==3) ||
+        (validity_mask_a[3]==1'b0 && a_mem_access_counter==4)) ?
+        1'b0 : (a_mem_access_counter >= `MEM_ACCESS_LATENCY); // otherwise high once the counter reaches MEM_ACCESS_LATENCY
+
+//////////////////////////////////////////////////////////////////////////
+// Split the data received from BRAM A into per-lane operands (purely combinational; no delay is applied here)
+//////////////////////////////////////////////////////////////////////////
+//Slice data into chunks and qualify it with whether it is valid or not
+assign a0_data = a_data[1*`DWIDTH-1:0*`DWIDTH] & {`DWIDTH{bram_rdata_a_valid}} & {`DWIDTH{validity_mask_a[0]}};
+assign a1_data = a_data[2*`DWIDTH-1:1*`DWIDTH] & {`DWIDTH{bram_rdata_a_valid}} & {`DWIDTH{validity_mask_a[1]}};
+assign a2_data = a_data[3*`DWIDTH-1:2*`DWIDTH] & {`DWIDTH{bram_rdata_a_valid}} & {`DWIDTH{validity_mask_a[2]}};
+assign a3_data = a_data[4*`DWIDTH-1:3*`DWIDTH] & {`DWIDTH{bram_rdata_a_valid}} & {`DWIDTH{validity_mask_a[3]}};
+
+
+//////////////////////////////////////////////////////////////////////////
+// Logic to generate addresses to BRAM B
+//////////////////////////////////////////////////////////////////////////
+reg [`AWIDTH-1:0] b_addr;
+reg b_mem_access; //flag that tells whether the compute unit is trying to access memory or not
+
+always @(posedge clk) begin
+  //else if (clk_cnt >= b_loc*`CU_SIZE+final_mat_mul_size) begin
+  //Writing the line above to avoid multiplication:
+  if (reset || ~start_eltwise_op) begin
+    b_addr <= address_mat_b ;
+    b_mem_access <= 0;
+  end
+  //else if ((clk_cnt >= b_loc*`CU_SIZE) && (clk_cnt < b_loc*`CU_SIZE+final_mat_mul_size)) begin
+  //Writing the line above to avoid multiplication:
+  else if (iterations <= count) begin // uses the iterations counter maintained by the BRAM A block
+    b_addr <= b_addr + 1;
+    b_mem_access <= 1;
+  end
+end  
+
+//////////////////////////////////////////////////////////////////////////
+// Logic to generate valid signals for data coming from BRAM B
+//////////////////////////////////////////////////////////////////////////
+reg [7:0] b_mem_access_counter; // cycles for which b_mem_access has been high
+always @(posedge clk) begin
+  if (reset || ~start_eltwise_op) begin
+    b_mem_access_counter <= 0;
+  end
+  else if (b_mem_access == 1) begin
+    b_mem_access_counter <= b_mem_access_counter + 1;  
+  end
+  else begin
+    b_mem_access_counter <= 0;
+  end
+end
+
+wire bram_rdata_b_valid; //flag that tells whether the data from memory is valid
+assign bram_rdata_b_valid = 
+       ((validity_mask_b[0]==1'b0 && b_mem_access_counter==1) || // same rule as bram_rdata_a_valid, applied to the B mask and counter
+        (validity_mask_b[1]==1'b0 && b_mem_access_counter==2) ||
+        (validity_mask_b[2]==1'b0 && b_mem_access_counter==3) ||
+        (validity_mask_b[3]==1'b0 && b_mem_access_counter==4)) ?
+        1'b0 : (b_mem_access_counter >= `MEM_ACCESS_LATENCY);
+
+//Slice data into chunks and qualify it with whether it is valid or not
+assign b0_data = b_data[1*`DWIDTH-1:0*`DWIDTH] & {`DWIDTH{bram_rdata_b_valid}} & {`DWIDTH{validity_mask_b[0]}};
+assign b1_data = b_data[2*`DWIDTH-1:1*`DWIDTH] & {`DWIDTH{bram_rdata_b_valid}} & {`DWIDTH{validity_mask_b[1]}};
+assign b2_data = b_data[3*`DWIDTH-1:2*`DWIDTH] & {`DWIDTH{bram_rdata_b_valid}} & {`DWIDTH{validity_mask_b[2]}};
+assign b3_data = b_data[4*`DWIDTH-1:3*`DWIDTH] & {`DWIDTH{bram_rdata_b_valid}} & {`DWIDTH{validity_mask_b[3]}};
+
+
+endmodule
+
+
+
+//////////////////////////////////////////////////////////////////////////
+// Array of processing elements: four independent PEs, PE k computing on (a<k>, b<k>) and driving out<k>
+//////////////////////////////////////////////////////////////////////////
+module pe_array(
+reset,
+clk,
+pe_reset,
+op,
+a0, a1, a2, a3,
+b0, b1, b2, b3,
+out0, out1, out2, out3
+);
+
+input clk;
+input reset;             // ORed with pe_reset to form the reset seen by every PE
+input pe_reset;          // ORed with reset to form the reset seen by every PE
+input [1:0] op;          // operation select, broadcast unchanged to every PE
+input [`DWIDTH-1:0] a0;  // first operands, lane 0..3
+input [`DWIDTH-1:0] a1;
+input [`DWIDTH-1:0] a2;
+input [`DWIDTH-1:0] a3;
+input [`DWIDTH-1:0] b0;  // second operands, lane 0..3
+input [`DWIDTH-1:0] b1;
+input [`DWIDTH-1:0] b2;
+input [`DWIDTH-1:0] b3;
+output [`DWIDTH-1:0] out0; // results, lane 0..3
+output [`DWIDTH-1:0] out1;
+output [`DWIDTH-1:0] out2;
+output [`DWIDTH-1:0] out3;
+
+wire [`DWIDTH-1:0] out0, out1, out2, out3;
+
+wire effective_rst; // either reset source resets the PEs
+assign effective_rst = reset | pe_reset;
+
+processing_element pe0(.reset(effective_rst), .clk(clk), .in_a(a0), .in_b(b0), .op(op), .out(out0));
+processing_element pe1(.reset(effective_rst), .clk(clk), .in_a(a1), .in_b(b1), .op(op), .out(out1));
+processing_element pe2(.reset(effective_rst), .clk(clk), .in_a(a2), .in_b(b2), .op(op), .out(out2));
+processing_element pe3(.reset(effective_rst), .clk(clk), .in_a(a3), .in_b(b3), .op(op), .out(out3));
+
+endmodule
+
+
+//////////////////////////////////////////////////////////////////////////
+// Processing element (PE): feeds in_a/in_b to seq_mul, seq_add and seq_sub in parallel and selects one result with op
+//////////////////////////////////////////////////////////////////////////
+module processing_element(
+ reset, 
+ clk, 
+ in_a,
+ in_b, 
+ op,
+ out
+ );
+
+ input reset;
+ input clk;
+ input  [`DWIDTH-1:0] in_a;
+ input  [`DWIDTH-1:0] in_b;
+ input  [1:0] op;            // 2'b00 selects out_sum, 2'b01 selects out_sub, any other value selects out_mul
+ output [`DWIDTH-1:0] out;
+
+ wire [`DWIDTH-1:0] out_mul; // result of seq_mul
+ wire [`DWIDTH-1:0] out_sum; // result of seq_add
+ wire [`DWIDTH-1:0] out_sub; // result of seq_sub
+
+ assign out = (op == 2'b00) ? out_sum : 
+              (op == 2'b01) ? out_sub :
+              out_mul;
+
+ seq_mul u_mul(.a(in_a), .b(in_b), .out(out_mul), .reset(reset), .clk(clk));
+ seq_add u_add(.a(in_a), .b(in_b), .out(out_sum), .reset(reset), .clk(clk));
+ seq_sub u_sub(.a(in_a), .b(in_b), .out(out_sub), .reset(reset), .clk(clk));
+
+endmodule
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Multiply block: registers a and b, feeds them to a multiplier core, then registers the core's output
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+module seq_mul(a, b, out, reset, clk);
+input [`DWIDTH-1:0] a;
+input [`DWIDTH-1:0] b;
+input reset;                 // synchronous; clears the input and output registers only
+input clk;
+output [`DWIDTH-1:0] out;
+
+reg [`DWIDTH-1:0] a_flopped; // registered copy of a
+reg [`DWIDTH-1:0] b_flopped; // registered copy of b
+
+wire [`DWIDTH-1:0] mul_out_temp;    // multiplier core output
+reg [`DWIDTH-1:0] mul_out_temp_reg; // registered core output, drives out
+
+always @(posedge clk) begin
+  if (reset) begin
+    a_flopped <= 0;
+    b_flopped <= 0;
+  end else begin
+    a_flopped <= a;
+    b_flopped <= b;
+  end
+end
+
+//assign mul_out_temp = a * b;
+`ifdef complex_dsp
+mult_fp_clk_16 mul_u1(.clk(clk), .a(a_flopped), .b(b_flopped), .out(mul_out_temp));
+`else
+FPMult_16 u_FPMult (.clk(clk), .rst(1'b0), .a(a_flopped), .b(b_flopped), .result(mul_out_temp), .flags()); // core reset tied low, flags left unconnected
+`endif
+
+always @(posedge clk) begin
+  if (reset) begin
+    mul_out_temp_reg <= 0;
+  end else begin
+    mul_out_temp_reg <= mul_out_temp;
+  end
+end
+
+assign out = mul_out_temp_reg;
+
+endmodule
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Addition block: registers a and b, feeds them to an adder core, then registers the core's output
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+module seq_add(a, b, out, reset, clk);
+input [`DWIDTH-1:0] a;
+input [`DWIDTH-1:0] b;
+input reset;                 // synchronous; clears the input and output registers only
+input clk;
+output [`DWIDTH-1:0] out;
+
+reg [`DWIDTH-1:0] a_flopped; // registered copy of a
+reg [`DWIDTH-1:0] b_flopped; // registered copy of b
+
+wire [`DWIDTH-1:0] sum_out_temp;    // adder core output
+reg [`DWIDTH-1:0] sum_out_temp_reg; // registered core output, drives out
+
+always @(posedge clk) begin
+  if (reset) begin
+    a_flopped <= 0;
+    b_flopped <= 0;
+  end else begin
+    a_flopped <= a;
+    b_flopped <= b;
+  end
+end
+
+//assign sum_out_temp = a + b;
+`ifdef complex_dsp
+addition_fp_clk_16 add_u1(.clk(clk), .a(a_flopped), .b(b_flopped), .out(sum_out_temp));
+`else
+FPAddSub u_FPAddSub (.clk(clk), .rst(1'b0), .a(a_flopped), .b(b_flopped), .operation(1'b0), .result(sum_out_temp), .flags()); // core reset tied low, operation fixed at 0, flags left unconnected
+`endif
+
+always @(posedge clk) begin
+  if (reset) begin
+    sum_out_temp_reg <= 0;
+  end else begin
+    sum_out_temp_reg <= sum_out_temp;
+  end
+end
+
+assign out = sum_out_temp_reg;
+
+endmodule
+
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Subtraction block: registers a and b, feeds them to an add/sub core, then registers the core's output
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+module seq_sub(a, b, out, reset, clk);
+input [`DWIDTH-1:0] a;
+input [`DWIDTH-1:0] b;
+input reset;                 // synchronous; clears the input and output registers only
+input clk;
+output [`DWIDTH-1:0] out;
+
+reg [`DWIDTH-1:0] a_flopped; // registered copy of a
+reg [`DWIDTH-1:0] b_flopped; // registered copy of b
+
+wire [`DWIDTH-1:0] sub_out_temp;    // add/sub core output
+reg [`DWIDTH-1:0] sub_out_temp_reg; // registered core output, drives out
+
+always @(posedge clk) begin
+  if (reset) begin
+    a_flopped <= 0;
+    b_flopped <= 0;
+  end else begin
+    a_flopped <= a;
+    b_flopped <= b;
+  end
+end
+
+//assign sub_out_temp = a - b;
+//Floating point adder has both modes - add and sub.
+//We don't provide the name of the mode here though.
+//NOTE(review): both branches below are wired exactly as in seq_add (addition_fp_clk_16, or FPAddSub with .operation(1'b0)) - confirm whether a real subtraction is intended.
+`ifdef complex_dsp
+addition_fp_clk_16 sub_u1(.clk(clk), .a(a_flopped), .b(b_flopped), .out(sub_out_temp));
+`else
+FPAddSub u_FPAddSub2(.clk(clk), .rst(1'b0), .a(a_flopped), .b(b_flopped), .operation(1'b0), .result(sub_out_temp), .flags()); // core reset tied low, flags left unconnected
+`endif
+
+always @(posedge clk) begin
+  if (reset) begin
+    sub_out_temp_reg <= 0;
+  end else begin
+    sub_out_temp_reg <= sub_out_temp;
+  end
+end
+
+assign out = sub_out_temp_reg;
+
+endmodule
+
+
+`ifndef complex_dsp
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Floating point 16-bit multiplier
+// This is a heavily modified version of:
+// https://github.com/fbrosser/DSP48E1-FP/tree/master/src/FPMult
+// Original author: Fredrik Brosser
+// Abridged by: Samidh Mehta
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+module FPMult_16(
+		clk,
+		rst,
+		a,
+		b,
+		result,
+		flags
+    );
+	
+	// Input Ports
+	input clk ;							// Clock
+	input rst ;							// Reset signal
+	input [`DWIDTH-1:0] a;						// Input A, a 32-bit floating point number
+	input [`DWIDTH-1:0] b;						// Input B, a 32-bit floating point number
+	
+	// Output ports
+	output [`DWIDTH-1:0] result ;					// Product, result of the operation, 32-bit FP number
+	output [4:0] flags ;						// Flags indicating exceptions according to IEEE754
+	
+	// Internal signals
+	wire [`DWIDTH-1:0] Z_int ;					// Product, result of the operation, 32-bit FP number
+	wire [4:0] Flags_int ;						// Flags indicating exceptions according to IEEE754
+	
+	wire Sa ;							// A's sign
+	wire Sb ;							// B's sign
+	wire Sp ;							// Product sign
+	wire [`EXPONENT-1:0] Ea ;					// A's exponent
+	wire [`EXPONENT-1:0] Eb ;					// B's exponent
+	wire [2*`MANTISSA+1:0] Mp ;					// Product mantissa
+	wire [4:0] InputExc ;						// Exceptions in inputs
+	wire [`MANTISSA-1:0] NormM ;					// Normalized mantissa
+	wire [`EXPONENT:0] NormE ;					// Normalized exponent
+	wire [`MANTISSA:0] RoundM ;					// Normalized mantissa
+	wire [`EXPONENT:0] RoundE ;					// Normalized exponent
+	wire [`MANTISSA:0] RoundMP ;					// Normalized mantissa
+	wire [`EXPONENT:0] RoundEP ;					// Normalized exponent
+	wire GRS ;
+
+	//reg [63:0] pipe_0;						// Pipeline register Input->Prep
+	reg [2*`DWIDTH-1:0] pipe_0;					// Pipeline register Input->Prep
+
+	//reg [92:0] pipe_1;						// Pipeline register Prep->Execute
+	//reg [3*`MANTISSA+2*`EXPONENT+7:0] pipe_1;			// Pipeline register Prep->Execute
+	reg [3*`MANTISSA+2*`EXPONENT+18:0] pipe_1;
+
+	//reg [38:0] pipe_2;						// Pipeline register Execute->Normalize
+	reg [`MANTISSA+`EXPONENT+7:0] pipe_2;				// Pipeline register Execute->Normalize
+	
+	//reg [72:0] pipe_3;						// Pipeline register Normalize->Round
+	reg [2*`MANTISSA+2*`EXPONENT+10:0] pipe_3;			// Pipeline register Normalize->Round
+
+	//reg [36:0] pipe_4;						// Pipeline register Round->Output
+	reg [`DWIDTH+4:0] pipe_4;					// Pipeline register Round->Output
+	
+	assign result = pipe_4[`DWIDTH+4:5] ;
+	assign flags = pipe_4[4:0] ;
+	
+	// Prepare the operands for alignment and check for exceptions
+	FPMult_PrepModule PrepModule(clk, rst, pipe_0[2*`DWIDTH-1:`DWIDTH], pipe_0[`DWIDTH-1:0], Sa, Sb, Ea[`EXPONENT-1:0], Eb[`EXPONENT-1:0], Mp[2*`MANTISSA+1:0], InputExc[4:0]) ;
+
+	// Perform (unsigned) mantissa multiplication
+	FPMult_ExecuteModule ExecuteModule(pipe_1[3*`MANTISSA+`EXPONENT*2+7:2*`MANTISSA+2*`EXPONENT+8], pipe_1[2*`MANTISSA+2*`EXPONENT+7:2*`MANTISSA+7], pipe_1[2*`MANTISSA+6:5], pipe_1[2*`MANTISSA+2*`EXPONENT+6:2*`MANTISSA+`EXPONENT+7], pipe_1[2*`MANTISSA+`EXPONENT+6:2*`MANTISSA+7], pipe_1[2*`MANTISSA+2*`EXPONENT+8], pipe_1[2*`MANTISSA+2*`EXPONENT+7], Sp, NormE[`EXPONENT:0], NormM[`MANTISSA-1:0], GRS) ;
+
+	// Round result and if necessary, perform a second (post-rounding) normalization step
+	FPMult_NormalizeModule NormalizeModule(pipe_2[`MANTISSA-1:0], pipe_2[`MANTISSA+`EXPONENT:`MANTISSA], RoundE[`EXPONENT:0], RoundEP[`EXPONENT:0], RoundM[`MANTISSA:0], RoundMP[`MANTISSA:0]) ;		
+
+	// Round result and if necessary, perform a second (post-rounding) normalization step
+	//FPMult_RoundModule RoundModule(pipe_3[47:24], pipe_3[23:0], pipe_3[65:57], pipe_3[56:48], pipe_3[66], pipe_3[67], pipe_3[72:68], Z_int[31:0], Flags_int[4:0]) ;		
+	FPMult_RoundModule RoundModule(pipe_3[2*`MANTISSA+1:`MANTISSA+1], pipe_3[`MANTISSA:0], pipe_3[2*`MANTISSA+2*`EXPONENT+3:2*`MANTISSA+`EXPONENT+3], pipe_3[2*`MANTISSA+`EXPONENT+2:2*`MANTISSA+2], pipe_3[2*`MANTISSA+2*`EXPONENT+4], pipe_3[2*`MANTISSA+2*`EXPONENT+5], pipe_3[2*`MANTISSA+2*`EXPONENT+10:2*`MANTISSA+2*`EXPONENT+6], Z_int[`DWIDTH-1:0], Flags_int[4:0]) ;		
+
+//adding always@ (*) instead of posedge clock to make design combinational
+	always @ (posedge clk) begin	
+		if(rst) begin
+			pipe_0 <= 0;
+			pipe_1 <= 0;
+			pipe_2 <= 0; 
+			pipe_3 <= 0;
+			pipe_4 <= 0;
+		end 
+		else begin		
+			/* PIPE 0
+				[2*`DWIDTH-1:`DWIDTH] A
+				[`DWIDTH-1:0] B
+			*/
+                       pipe_0 <= {a, b} ;
+
+
+			/* PIPE 1
+				[2*`EXPONENT+3*`MANTISSA + 18: 2*`EXPONENT+2*`MANTISSA + 18] //pipe_0[`DWIDTH+`MANTISSA-1:`DWIDTH] , mantissa of A
+				[2*`EXPONENT+2*`MANTISSA + 17 :2*`EXPONENT+2*`MANTISSA + 9] // pipe_0[8:0]
+				[2*`EXPONENT+2*`MANTISSA + 8] Sa
+				[2*`EXPONENT+2*`MANTISSA + 7] Sb
+				[2*`EXPONENT+2*`MANTISSA + 6:`EXPONENT+2*`MANTISSA+7] Ea
+				[`EXPONENT +2*`MANTISSA+6:2*`MANTISSA+7] Eb
+				[2*`MANTISSA+1+5:5] Mp
+				[4:0] InputExc
+			*/
+			//pipe_1 <= {pipe_0[`DWIDTH+`MANTISSA-1:`DWIDTH], pipe_0[`MANTISSA_MUL_SPLIT_LSB-1:0], Sa, Sb, Ea[`EXPONENT-1:0], Eb[`EXPONENT-1:0], Mp[2*`MANTISSA-1:0], InputExc[4:0]} ;
+			pipe_1 <= {pipe_0[`DWIDTH+`MANTISSA-1:`DWIDTH], pipe_0[8:0], Sa, Sb, Ea[`EXPONENT-1:0], Eb[`EXPONENT-1:0], Mp[2*`MANTISSA+1:0], InputExc[4:0]} ;
+			
+			/* PIPE 2
+				[`EXPONENT + `MANTISSA + 7:`EXPONENT + `MANTISSA + 3] InputExc
+				[`EXPONENT + `MANTISSA + 2] GRS
+				[`EXPONENT + `MANTISSA + 1] Sp
+				[`EXPONENT + `MANTISSA:`MANTISSA] NormE
+				[`MANTISSA-1:0] NormM
+			*/
+			pipe_2 <= {pipe_1[4:0], GRS, Sp, NormE[`EXPONENT:0], NormM[`MANTISSA-1:0]} ;
+			/* PIPE 3
+				[2*`EXPONENT+2*`MANTISSA+10:2*`EXPONENT+2*`MANTISSA+6] InputExc
+				[2*`EXPONENT+2*`MANTISSA+5] GRS
+				[2*`EXPONENT+2*`MANTISSA+4] Sp	
+				[2*`EXPONENT+2*`MANTISSA+3:`EXPONENT+2*`MANTISSA+3] RoundE
+				[`EXPONENT+2*`MANTISSA+2:2*`MANTISSA+2] RoundEP
+				[2*`MANTISSA+1:`MANTISSA+1] RoundM
+				[`MANTISSA:0] RoundMP
+			*/
+			pipe_3 <= {pipe_2[`EXPONENT+`MANTISSA+7:`EXPONENT+`MANTISSA+1], RoundE[`EXPONENT:0], RoundEP[`EXPONENT:0], RoundM[`MANTISSA:0], RoundMP[`MANTISSA:0]} ;
+			/* PIPE 4
+				[`DWIDTH+4:5] Z
+				[4:0] Flags
+			*/				
+			pipe_4 <= {Z_int[`DWIDTH-1:0], Flags_int[4:0]} ;
+		end
+	end
+		
+endmodule
+
+
+
+// FPMult_PrepModule: takes both operands apart (sign, exponent), flags NaN and
+// Infinity inputs, and multiplies the two significands (hidden one prepended).
+// clk and rst are kept in the port list but are not read inside this module.
+module FPMult_PrepModule (
+		clk,
+		rst,
+		a,
+		b,
+		Sa,
+		Sb,
+		Ea,
+		Eb,
+		Mp,
+		InputExc
+	);
+	
+	// Input ports
+	input clk ;
+	input rst ;
+	input [`DWIDTH-1:0] a ;								// Input A, a DWIDTH-bit floating point number
+	input [`DWIDTH-1:0] b ;								// Input B, a DWIDTH-bit floating point number
+	
+	// Output ports
+	output Sa ;										// A's sign
+	output Sb ;										// B's sign
+	output [`EXPONENT-1:0] Ea ;								// A's exponent field
+	output [`EXPONENT-1:0] Eb ;								// B's exponent field
+	output [2*`MANTISSA+1:0] Mp ;							// Significand product
+	output [4:0] InputExc ;						// {any exception, ANaN, BNaN, AInf, BInf}
+	
+	// Internal signals							// If signal is high...
+	wire ExpOnesA ;										// A's exponent field is all ones
+	wire ExpOnesB ;										// B's exponent field is all ones
+	wire FracSetA ;										// A's mantissa field is non-zero
+	wire FracSetB ;										// B's mantissa field is non-zero
+	wire ANaN ;										// A is a NaN
+	wire BNaN ;										// B is a NaN
+	wire AInf ;										// A is infinity
+	wire BInf ;										// B is infinity
+	
+	assign ExpOnesA = &(a[`DWIDTH-2:`MANTISSA]) ;
+	assign ExpOnesB = &(b[`DWIDTH-2:`MANTISSA]) ;
+	assign FracSetA = |(a[`MANTISSA-1:0]) ;
+	assign FracSetB = |(b[`MANTISSA-1:0]) ;
+	
+	// NaN and Infinity share the all-ones exponent and differ only in the mantissa,
+	// so the second term must test the mantissa field (testing the exponent again can never flag Inf).
+	assign ANaN = ExpOnesA &  FracSetA ;			// All one exponent and not all zero mantissa - NaN
+	assign BNaN = ExpOnesB &  FracSetB ;			// All one exponent and not all zero mantissa - NaN
+	assign AInf = ExpOnesA & ~FracSetA ;			// All one exponent and all zero mantissa - Infinity
+	assign BInf = ExpOnesB & ~FracSetB ;			// All one exponent and all zero mantissa - Infinity
+	
+	// Check for any exceptions and put all flags into exception vector
+	assign InputExc = {(ANaN | BNaN | AInf | BInf), ANaN, BNaN, AInf, BInf} ;
+	
+	// Take input numbers apart
+	assign Sa = a[`DWIDTH-1] ;							// A's sign
+	assign Sb = b[`DWIDTH-1] ;							// B's sign
+	assign Ea = a[`DWIDTH-2:`MANTISSA];						// A's exponent field, passed through unmodified
+	assign Eb = b[`DWIDTH-2:`MANTISSA];						// B's exponent field, passed through unmodified
+	
+	// Full (unsigned) significand product. Each operand is the mantissa field with
+	// the implicit leading one prepended, so each factor is MANTISSA+1 bits wide
+	// and the product fills all 2*MANTISSA+2 bits of Mp.
+	assign Mp = ({1'b1,a[`MANTISSA-1:0]}*{1'b1, b[`MANTISSA-1:0]}) ;
+	
+endmodule
+
+
+// FPMult_ExecuteModule: derives the product sign, picks the normalized mantissa
+// window out of the precomputed significand product MpC, and adds the exponents.
+// Ports a and b are kept in the port list but are not read inside this module.
+module FPMult_ExecuteModule(
+		a,
+		b,
+		MpC,
+		Ea,
+		Eb,
+		Sa,
+		Sb,
+		Sp,
+		NormE,
+		NormM,
+		GRS
+    );
+
+	// Input ports
+	input [`MANTISSA-1:0] a ;
+	input [2*`EXPONENT:0] b ;
+	input [2*`MANTISSA+1:0] MpC ;					// Significand product from the prep stage
+	input [`EXPONENT-1:0] Ea ;						// A's exponent
+	input [`EXPONENT-1:0] Eb ;						// B's exponent
+	input Sa ;								// A's sign
+	input Sb ;								// B's sign
+	
+	// Output ports
+	output Sp ;								// Product sign
+	output [`EXPONENT:0] NormE ;						// Exponent sum (still carries the doubled bias)
+	output [`MANTISSA-1:0] NormM ;						// Normalized mantissa
+	output GRS ;								// Rounding decision bit
+	
+	// Internal signals
+	wire [2*`MANTISSA+1:0] Product ;					// Local copy of MpC
+	wire TopBit ;								// Most significant product bit
+	wire PairSet ;								// Product bits MANTISSA+1 and MANTISSA both set
+	wire TailSet ;								// Any product bit below MANTISSA set
+	
+	assign Product = MpC ;
+	assign TopBit = Product[2*`MANTISSA+1] ;
+	assign PairSet = Product[`MANTISSA+1] & Product[`MANTISSA] ;
+	assign TailSet = |Product[`MANTISSA-1:0] ;
+	
+	// Equal signs give a positive product
+	assign Sp = Sa ^ Sb ;
+	// When the top product bit is set, the mantissa window moves up one bit and the exponent sum is incremented
+	assign NormM = TopBit ? Product[2*`MANTISSA:`MANTISSA+1] : Product[2*`MANTISSA-1:`MANTISSA] ;
+	assign NormE = Ea + Eb + TopBit ;
+	assign GRS = PairSet | TailSet ;
+endmodule
+
+// FPMult_NormalizeModule: removes the exponent bias from the summed exponent and
+// prepares two exponent/mantissa candidates: the plain value (RoundE/RoundM) and
+// the value used when rounding is applied (RoundEP/RoundMP).
+module FPMult_NormalizeModule(
+		NormM,
+		NormE,
+		RoundE,
+		RoundEP,
+		RoundM,
+		RoundMP
+    );
+
+	// Input Ports
+	input [`MANTISSA-1:0] NormM ;									// Normalized mantissa
+	input [`EXPONENT:0] NormE ;									// Normalized exponent (sum of two biased exponents)
+
+	// Output Ports
+	output [`EXPONENT:0] RoundE ;									// Exponent with the extra bias removed
+	output [`EXPONENT:0] RoundEP ;									// RoundE + 1
+	output [`MANTISSA:0] RoundM ;									// Mantissa, zero-extended by one bit
+	output [`MANTISSA:0] RoundMP ;									// Mantissa candidate used when rounding up
+	
+	// bias = 2^(EXPONENT-1) - 1, e.g. EXPONENT = 5 gives 2^4 - 1 = 15
+	wire [`EXPONENT-1 : 0] bias;
+	assign bias =  ((1<< (`EXPONENT -1)) -1);
+
+	assign RoundE = NormE - bias ;
+	// The "plus one" exponent must be one ABOVE RoundE (it was NormE - bias - 1, i.e. one below)
+	assign RoundEP = NormE - bias + 1 ;
+	assign RoundM = NormM ;
+	// NOTE(review): RoundMP equals RoundM, so no increment is applied when rounding up - confirm this is intended
+	assign RoundMP = NormM ;
+endmodule
+
+// FPMult_RoundModule: chooses between the plain and the rounded-up mantissa,
+// renormalizes by one bit if the chosen mantissa has its top bit set, and packs
+// sign, exponent and mantissa into the result word. Flags mirror InputExc.
+module FPMult_RoundModule(
+		RoundM,
+		RoundMP,
+		RoundE,
+		RoundEP,
+		Sp,
+		GRS,
+		InputExc,
+		Z,
+		Flags
+    );
+
+	// Input Ports
+	input [`MANTISSA:0] RoundM ;									// Mantissa candidate used when not rounding up
+	input [`MANTISSA:0] RoundMP ;									// Mantissa candidate used when rounding up
+	input [`EXPONENT:0] RoundE ;									// Exponent used when no post-rounding shift happens
+	input [`EXPONENT:0] RoundEP ;									// Exponent used when a post-rounding shift happens
+	input Sp ;												// Product sign
+	input GRS ;												// Selects RoundMP when high
+	input [4:0] InputExc ;										// Exception bits detected on the inputs
+	
+	// Output Ports
+	output [`DWIDTH-1:0] Z ;										// Final product
+	output [4:0] Flags ;											// Copy of InputExc
+	
+	// Internal Signals
+	wire [`MANTISSA:0] Chosen ;									// Mantissa selected by GRS
+	wire Carry ;												// Top bit of the selected mantissa
+	wire [`MANTISSA:0] FinalM ;									// Mantissa after the optional one-bit shift
+	wire [`EXPONENT:0] FinalE ;									// Exponent matching FinalM
+	
+	assign Chosen = GRS ? RoundMP : RoundM ;
+	assign Carry = Chosen[`MANTISSA] ;
+	assign FinalM = Carry ? (Chosen >> 1) : Chosen ;				// One-bit right shift when the top bit is set
+	assign FinalE = Carry ? RoundEP : RoundE ;					// Exponent follows the shift decision
+	assign Z = {Sp, FinalE[`EXPONENT-1:0], FinalM[`MANTISSA-1:0]} ;	// Putting the pieces together
+	assign Flags = InputExc[4:0] ;
+endmodule
+`endif
+
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+// Floating point 16-bit adder
+// This is a heavily modified version of:
+// https://github.com/fbrosser/DSP48E1-FP/tree/master/src/FP_AddSub
+// Original author: Fredrik Brosser
+// Abridged by: Samidh Mehta
+///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+`ifndef complex_dsp
+
+module FPAddSub(
+		//bf16,
+		clk,
+		rst,
+		a,
+		b,
+		operation,			// 0 add, 1 sub
+		result,
+		flags
+	);
+	// Pipelined FP adder/subtractor top level: pipe_1, pipe_3, pipe_5 and pipe_8 are clocked registers; pipe_2, pipe_4, pipe_6, pipe_7 and pipe_9 are combinational wires.
+
+	// Clock and reset
+	input clk ;										// Clock signal
+	input rst ;										// Reset (active high, resets pipeline registers)
+	
+	// Input ports
+	input [`DWIDTH-1:0] a ;								// Input A, a DWIDTH-bit floating point number
+	input [`DWIDTH-1:0] b ;								// Input B, a DWIDTH-bit floating point number
+	input operation ;								// Operation select signal
+	
+	// Output ports
+	output [`DWIDTH-1:0] result ;						// Result of the operation
+	output [4:0] flags ;							// Flags indicating exceptions according to IEEE754
+	
+	// Pipeline Registers
+	//reg [79:0] pipe_1;							// Pipeline register PreAlign->Align1
+	reg [2*`EXPONENT + 2*`DWIDTH + 5:0] pipe_1;							// Pipeline register PreAlign->Align1
+
+	//reg [67:0] pipe_2;							// Pipeline register Align1->Align3
+	//reg [2*`EXPONENT+ 2*`MANTISSA + 8:0] pipe_2;							// Pipeline register Align1->Align3
+	wire [2*`EXPONENT+ 2*`MANTISSA + 8:0] pipe_2;
+
+	//reg [76:0] pipe_3;	68						// Pipeline register Align1->Align3
+	reg [2*`EXPONENT+ 2*`MANTISSA + 9:0] pipe_3;							// Pipeline register AlignShift1->AlignShift2
+
+	//reg [69:0] pipe_4;							// Pipeline register Align3->Execute
+	//reg [2*`EXPONENT+ 2*`MANTISSA + 9:0] pipe_4;							// Pipeline register Align3->Execute
+	wire [2*`EXPONENT+ 2*`MANTISSA + 9:0] pipe_4;
+	
+	//reg [51:0] pipe_5;							// Pipeline register Execute->Normalize
+	reg [`DWIDTH+`EXPONENT+11:0] pipe_5;							// Pipeline register Execute->Normalize
+
+	//reg [56:0] pipe_6;							// Pipeline register Nomalize->NormalizeShift1
+	//reg [`DWIDTH+`EXPONENT+16:0] pipe_6;							// Pipeline register Nomalize->NormalizeShift1
+	wire [`DWIDTH+`EXPONENT+16:0] pipe_6;
+
+	//reg [56:0] pipe_7;							// Pipeline register NormalizeShift2->NormalizeShift3
+	//reg [`DWIDTH+`EXPONENT+16:0] pipe_7;							// Pipeline register NormalizeShift2->NormalizeShift3
+	wire [`DWIDTH+`EXPONENT+16:0] pipe_7;
+	//reg [54:0] pipe_8;							// Pipeline register NormalizeShift3->Round
+	reg [`EXPONENT*2+`MANTISSA+15:0] pipe_8;							// Pipeline register NormalizeShift3->Round
+
+	//reg [40:0] pipe_9;							// Pipeline register NormalizeShift3->Round
+	//reg [`DWIDTH+8:0] pipe_9;							// Pipeline register NormalizeShift3->Round
+	wire [`DWIDTH+8:0] pipe_9;
+
+	// Internal wires between modules
+	wire [`DWIDTH-2:0] Aout_0 ;							// A - sign
+	wire [`DWIDTH-2:0] Bout_0 ;							// B - sign
+	wire Opout_0 ;									// Operation select as forwarded by the pre-align module
+	wire Sa_0 ;										// A's sign
+	wire Sb_0 ;										// B's sign
+	wire MaxAB_1 ;									// Indicates the larger of A and B(0/A, 1/B)
+	wire [`EXPONENT-1:0] CExp_1 ;							// Common Exponent
+	wire [`EXPONENT-1:0] Shift_1 ;							// Number of steps to smaller mantissa shift right (align)
+	wire [`MANTISSA-1:0] Mmax_1 ;							// Larger mantissa
+	wire [4:0] InputExc_0 ;						// Input numbers are exceptions
+	wire [2*`EXPONENT-1:0] ShiftDet_0 ;
+	wire [`MANTISSA-1:0] MminS_1 ;						// Smaller mantissa after 0/16 shift
+	wire [`MANTISSA:0] MminS_2 ;						// Smaller mantissa after 0/4/8/12 shift
+	wire [`MANTISSA:0] Mmin_3 ;							// Smaller mantissa after 0/1/2/3 shift
+	wire [`DWIDTH:0] Sum_4 ;
+	wire PSgn_4 ;
+	wire Opr_4 ;
+	wire [`EXPONENT-1:0] Shift_5 ;							// Number of steps to shift sum left (normalize)
+	wire [`DWIDTH:0] SumS_5 ;							// Sum after 0/16 shift
+	wire [`DWIDTH:0] SumS_6 ;							// Declared but not connected in this module
+	wire [`DWIDTH:0] SumS_7 ;							// Sum as output by NormalizeShift1
+	wire [`MANTISSA-1:0] NormM_8 ;						// Normalized mantissa
+	wire [`EXPONENT:0] NormE_8;							// Adjusted exponent
+	wire ZeroSum_8 ;								// Zero flag
+	wire NegE_8 ;									// Flag indicating negative exponent
+	wire R_8 ;										// Round bit
+	wire S_8 ;										// Final sticky bit
+	wire FG_8 ;										// Final sticky bit
+	wire [`DWIDTH-1:0] P_int ;
+	wire EOF ;
+	
+	// Prepare the operands for alignment and check for exceptions
+	FPAddSub_PrealignModule PrealignModule
+	(	// Inputs
+		a, b, operation,
+		// Outputs
+		Sa_0, Sb_0, ShiftDet_0[2*`EXPONENT-1:0], InputExc_0[4:0], Aout_0[`DWIDTH-2:0], Bout_0[`DWIDTH-2:0], Opout_0) ;
+		
+	// Pick the larger operand, the common exponent and the alignment shift amount
+	FPAddSub_AlignModule AlignModule
+	(	// Inputs
+		pipe_1[2*`EXPONENT + 2*`DWIDTH + 4: 2*`EXPONENT +`DWIDTH + 6], pipe_1[2*`EXPONENT +`DWIDTH + 5 :  2*`EXPONENT +7], pipe_1[2*`EXPONENT+4:5],
+		// Outputs
+		CExp_1[`EXPONENT-1:0], MaxAB_1, Shift_1[`EXPONENT-1:0], MminS_1[`MANTISSA-1:0], Mmax_1[`MANTISSA-1:0]) ;	
+
+	// Alignment Shift Stage 1
+	FPAddSub_AlignShift1 AlignShift1
+	(  // Inputs
+		//bf16, 
+		pipe_2[`MANTISSA-1:0], pipe_2[`EXPONENT+ 2*`MANTISSA + 4 : 2*`MANTISSA + 7],
+		// Outputs
+		MminS_2[`MANTISSA:0]) ;
+
+	// Alignment Shift Stage 2, driven by the two least significant shift bits
+	FPAddSub_AlignShift2 AlignShift2  
+	(  // Inputs
+		pipe_3[`MANTISSA:0], pipe_3[2*`MANTISSA+7:2*`MANTISSA+6],
+		// Outputs
+		Mmin_3[`MANTISSA:0]) ;
+						
+	// Perform mantissa addition
+	FPAddSub_ExecutionModule ExecutionModule
+	(  // Inputs
+		pipe_4[`MANTISSA*2+5:`MANTISSA+6], pipe_4[`MANTISSA:0], pipe_4[2*`EXPONENT+ 2*`MANTISSA + 8], pipe_4[2*`EXPONENT+ 2*`MANTISSA + 7], pipe_4[2*`EXPONENT+ 2*`MANTISSA + 6], pipe_4[2*`EXPONENT+ 2*`MANTISSA + 9],
+		// Outputs
+		Sum_4[`DWIDTH:0], PSgn_4, Opr_4) ;
+	
+	// Prepare normalization of result
+	FPAddSub_NormalizeModule NormalizeModule
+	(  // Inputs
+		pipe_5[`DWIDTH:0], 
+		// Outputs
+		SumS_5[`DWIDTH:0], Shift_5[4:0]) ;
+					
+	// Normalization Shift Stage 1
+	FPAddSub_NormalizeShift1 NormalizeShift1
+	(  // Inputs
+		pipe_6[`DWIDTH:0], pipe_6[`DWIDTH+`EXPONENT+14:`DWIDTH+`EXPONENT+11],
+		// Outputs
+		SumS_7[`DWIDTH:0]) ;
+		
+	// Normalization Shift Stage 3 and final guard, sticky and round bits
+	FPAddSub_NormalizeShift2 NormalizeShift2
+	(  // Inputs
+		pipe_7[`DWIDTH:0], pipe_7[`DWIDTH+`EXPONENT+5:`DWIDTH+6], pipe_7[`DWIDTH+`EXPONENT+15:`DWIDTH+`EXPONENT+11],
+		// Outputs
+		NormM_8[`MANTISSA-1:0], NormE_8[`EXPONENT:0], ZeroSum_8, NegE_8, R_8, S_8, FG_8) ;
+
+	// Round and put result together
+	FPAddSub_RoundModule RoundModule
+	(  // Inputs
+		 pipe_8[3], pipe_8[4+`EXPONENT:4], pipe_8[`EXPONENT+`MANTISSA+4:5+`EXPONENT], pipe_8[1], pipe_8[0], pipe_8[`EXPONENT*2+`MANTISSA+15], pipe_8[`EXPONENT*2+`MANTISSA+12], pipe_8[`EXPONENT*2+`MANTISSA+11], pipe_8[`EXPONENT*2+`MANTISSA+14], pipe_8[`EXPONENT*2+`MANTISSA+10], 
+		// Outputs
+		P_int[`DWIDTH-1:0], EOF) ;
+	
+	// Check for exceptions
+	FPAddSub_ExceptionModule Exceptionmodule
+	(  // Inputs
+		pipe_9[8+`DWIDTH:9], pipe_9[8], pipe_9[7], pipe_9[6], pipe_9[5:1], pipe_9[0], 
+		// Outputs
+		result[`DWIDTH-1:0], flags[4:0]) ;			
+	
+
+assign pipe_2 = {pipe_1[2*`EXPONENT + 2*`DWIDTH + 5], pipe_1[2*`EXPONENT +6:2*`EXPONENT +5], MaxAB_1, CExp_1[`EXPONENT-1:0], Shift_1[`EXPONENT-1:0], Mmax_1[`MANTISSA-1:0], pipe_1[4:0], MminS_1[`MANTISSA-1:0]} ;
+assign pipe_4 = {pipe_3[2*`EXPONENT+ 2*`MANTISSA + 9:`MANTISSA+1], Mmin_3[`MANTISSA:0]} ;
+assign pipe_6 = {pipe_5[`DWIDTH+`EXPONENT+11], Shift_5[4:0], pipe_5[`DWIDTH+`EXPONENT+10:`DWIDTH+1], SumS_5[`DWIDTH:0]} ;
+assign pipe_7 = {pipe_6[`DWIDTH+`EXPONENT+16:`DWIDTH+1], SumS_7[`DWIDTH:0]} ;
+assign pipe_9 = {P_int[`DWIDTH-1:0], pipe_8[2], pipe_8[1], pipe_8[0], pipe_8[`EXPONENT+`MANTISSA+9:`EXPONENT+`MANTISSA+5], EOF} ;
+
+	always @ (posedge clk) begin	
+		if(rst) begin
+			pipe_1 <= 0;
+			//pipe_2 <= 0;
+			pipe_3 <= 0;
+			//pipe_4 <= 0;
+			pipe_5 <= 0;
+			//pipe_6 <= 0;
+			//pipe_7 <= 0;
+			pipe_8 <= 0;
+			//pipe_9 <= 0;
+		end 
+		else begin
+/* PIPE_1:
+	[2*`EXPONENT + 2*`DWIDTH + 5]  Opout_0
+	[2*`EXPONENT + 2*`DWIDTH + 4: 2*`EXPONENT +`DWIDTH + 6] A_out0
+	[2*`EXPONENT +`DWIDTH + 5 :  2*`EXPONENT +7] Bout_0
+	[2*`EXPONENT +6] Sa_0
+	[2*`EXPONENT +5] Sb_0
+	[2*`EXPONENT +4 : 5] ShiftDet_0
+	[4:0] Input Exc
+*/
+			pipe_1 <= {Opout_0, Aout_0[`DWIDTH-2:0], Bout_0[`DWIDTH-2:0], Sa_0, Sb_0, ShiftDet_0[2*`EXPONENT -1:0], InputExc_0[4:0]} ;	
+/* PIPE_2
+[2*`EXPONENT+ 2*`MANTISSA + 8] operation
+[2*`EXPONENT+ 2*`MANTISSA + 7] Sa_0
+[2*`EXPONENT+ 2*`MANTISSA + 6] Sb_0
+[2*`EXPONENT+ 2*`MANTISSA + 5] MaxAB_0
+[2*`EXPONENT+ 2*`MANTISSA + 4:`EXPONENT+ 2*`MANTISSA + 5] CExp_0
+[`EXPONENT+ 2*`MANTISSA + 4 : 2*`MANTISSA + 5] Shift_0
+[2*`MANTISSA + 4:`MANTISSA + 5] Mmax_0
+[`MANTISSA + 4 : `MANTISSA] InputExc_0
+[`MANTISSA-1:0] MminS_1
+*/
+			//pipe_2 <= {pipe_1[2*`EXPONENT + 2*`DWIDTH + 5], pipe_1[2*`EXPONENT +6:2*`EXPONENT +5], MaxAB_1, CExp_1[`EXPONENT-1:0], Shift_1[`EXPONENT-1:0], Mmax_1[`MANTISSA-1:0], pipe_1[4:0], MminS_1[`MANTISSA-1:0]} ;	
+/* PIPE_3
+[2*`EXPONENT+ 2*`MANTISSA + 9] operation
+[2*`EXPONENT+ 2*`MANTISSA + 8] Sa_0
+[2*`EXPONENT+ 2*`MANTISSA + 7] Sb_0
+[2*`EXPONENT+ 2*`MANTISSA + 6] MaxAB_0
+[2*`EXPONENT+ 2*`MANTISSA + 5:`EXPONENT+ 2*`MANTISSA + 6] CExp_0
+[`EXPONENT+ 2*`MANTISSA + 5 : 2*`MANTISSA + 6] Shift_0
+[2*`MANTISSA + 5:`MANTISSA + 6] Mmax_0
+[`MANTISSA + 5 : `MANTISSA + 1] InputExc_0
+[`MANTISSA:0] MminS_2
+*/
+			pipe_3 <= {pipe_2[2*`EXPONENT+ 2*`MANTISSA + 8:`MANTISSA], MminS_2[`MANTISSA:0]} ;	
+/* PIPE_4
+[2*`EXPONENT+ 2*`MANTISSA + 9] operation
+[2*`EXPONENT+ 2*`MANTISSA + 8] Sa_0
+[2*`EXPONENT+ 2*`MANTISSA + 7] Sb_0
+[2*`EXPONENT+ 2*`MANTISSA + 6] MaxAB_0
+[2*`EXPONENT+ 2*`MANTISSA + 5:`EXPONENT+ 2*`MANTISSA + 6] CExp_0
+[`EXPONENT+ 2*`MANTISSA + 5 : 2*`MANTISSA + 6] Shift_0
+[2*`MANTISSA + 5:`MANTISSA + 6] Mmax_0
+[`MANTISSA + 5 : `MANTISSA + 1] InputExc_0
+[`MANTISSA:0] MminS_3
+*/				
+			//pipe_4 <= {pipe_3[2*`EXPONENT+ 2*`MANTISSA + 9:`MANTISSA+1], Mmin_3[`MANTISSA:0]} ;	
+/* PIPE_5 :
+[`DWIDTH+ `EXPONENT + 11] operation
+[`DWIDTH+ `EXPONENT + 10] PSgn_4
+[`DWIDTH+ `EXPONENT + 9] Opr_4
+[`DWIDTH+ `EXPONENT + 8] Sa_0
+[`DWIDTH+ `EXPONENT + 7] Sb_0
+[`DWIDTH+ `EXPONENT + 6] MaxAB_0
+[`DWIDTH+ `EXPONENT + 5 :`DWIDTH+6] CExp_0
+[`DWIDTH+5:`DWIDTH+1] InputExc_0
+[`DWIDTH:0] Sum_4
+*/					
+			pipe_5 <= {pipe_4[2*`EXPONENT+ 2*`MANTISSA + 9], PSgn_4, Opr_4, pipe_4[2*`EXPONENT+ 2*`MANTISSA + 8:`EXPONENT+ 2*`MANTISSA + 6], pipe_4[`MANTISSA+5:`MANTISSA+1], Sum_4[`DWIDTH:0]} ;
+/* PIPE_6 :
+[`DWIDTH+ `EXPONENT + 16] operation
+[`DWIDTH+ `EXPONENT + 15:`DWIDTH+ `EXPONENT + 11] Shift_5
+[`DWIDTH+ `EXPONENT + 10] PSgn_4
+[`DWIDTH+ `EXPONENT + 9] Opr_4
+[`DWIDTH+ `EXPONENT + 8] Sa_0
+[`DWIDTH+ `EXPONENT + 7] Sb_0
+[`DWIDTH+ `EXPONENT + 6] MaxAB_0
+[`DWIDTH+ `EXPONENT + 5 :`DWIDTH+6] CExp_0
+[`DWIDTH+5:`DWIDTH+1] InputExc_0
+[`DWIDTH:0] Sum_4
+*/				
+			//pipe_6 <= {pipe_5[`DWIDTH+`EXPONENT+11], Shift_5[4:0], pipe_5[`DWIDTH+`EXPONENT+10:`DWIDTH+1], SumS_5[`DWIDTH:0]} ;	
+/* PIPE_7 :
+[`DWIDTH+ `EXPONENT + 16] operation
+[`DWIDTH+ `EXPONENT + 15:`DWIDTH+ `EXPONENT + 11] Shift_5
+[`DWIDTH+ `EXPONENT + 10] PSgn_4
+[`DWIDTH+ `EXPONENT + 9] Opr_4
+[`DWIDTH+ `EXPONENT + 8] Sa_0
+[`DWIDTH+ `EXPONENT + 7] Sb_0
+[`DWIDTH+ `EXPONENT + 6] MaxAB_0
+[`DWIDTH+ `EXPONENT + 5 :`DWIDTH+6] CExp_0
+[`DWIDTH+5:`DWIDTH+1] InputExc_0
+[`DWIDTH:0] Sum_4
+*/						
+			//pipe_7 <= {pipe_6[`DWIDTH+`EXPONENT+16:`DWIDTH+1], SumS_7[`DWIDTH:0]} ;	
+/* PIPE_8:
+[2*`EXPONENT + `MANTISSA + 15] FG_8 
+[2*`EXPONENT + `MANTISSA + 14] operation
+[2*`EXPONENT + `MANTISSA + 13] PSgn_4
+[2*`EXPONENT + `MANTISSA + 12] Sa_0
+[2*`EXPONENT + `MANTISSA + 11] Sb_0
+[2*`EXPONENT + `MANTISSA + 10] MaxAB_0
+[2*`EXPONENT + `MANTISSA + 9:`EXPONENT + `MANTISSA + 10] CExp_0
+[`EXPONENT + `MANTISSA + 9:`EXPONENT + `MANTISSA + 5] InputExc_8
+[`EXPONENT + `MANTISSA + 4 :`EXPONENT + 5] NormM_8 
+[`EXPONENT + 4 :4] NormE_8
+[3] ZeroSum_8
+[2] NegE_8
+[1] R_8
+[0] S_8
+*/				
+			pipe_8 <= {FG_8, pipe_7[`DWIDTH+`EXPONENT+16], pipe_7[`DWIDTH+`EXPONENT+10], pipe_7[`DWIDTH+`EXPONENT+8:`DWIDTH+1], NormM_8[`MANTISSA-1:0], NormE_8[`EXPONENT:0], ZeroSum_8, NegE_8, R_8, S_8} ;	
+/* pipe_9:
+[`DWIDTH + 8 :9] P_int
+[8] NegE_8
+[7] R_8
+[6] S_8
+[5:1] InputExc_8
+[0] EOF
+*/				
+			//pipe_9 <= {P_int[`DWIDTH-1:0], pipe_8[2], pipe_8[1], pipe_8[0], pipe_8[`EXPONENT+`MANTISSA+9:`EXPONENT+`MANTISSA+5], EOF} ;	
+		end
+	end		
+	
+endmodule
+
+
+//
+// Description:	 	The pre-alignment module is responsible for taking the inputs
+//							apart and checking the parts for exceptions.
+//							The exponent difference is also calculated in this module.
+//
+
+
+// FPAddSub_PrealignModule: splits the operands into sign and magnitude, flags
+// NaN/Infinity inputs, and computes the exponent difference in both directions.
+module FPAddSub_PrealignModule(
+		A,
+		B,
+		operation,
+		Sa,
+		Sb,
+		ShiftDet,
+		InputExc,
+		Aout,
+		Bout,
+		Opout
+	);
+	// Input ports
+	input [`DWIDTH-1:0] A ;										// Input A, a DWIDTH-bit floating point number
+	input [`DWIDTH-1:0] B ;										// Input B, a DWIDTH-bit floating point number
+	input operation ;											// Forwarded unchanged on Opout
+	// Output ports
+	output Sa ;												// A's sign
+	output Sb ;												// B's sign
+	output [2*`EXPONENT-1:0] ShiftDet ;							// {ExpB - ExpA, ExpA - ExpB}
+	output [4:0] InputExc ;								// {any exception, ANaN, BNaN, AInf, BInf}
+	output [`DWIDTH-2:0] Aout ;									// A without its sign bit
+	output [`DWIDTH-2:0] Bout ;									// B without its sign bit
+	output Opout ;
+	// Internal signals									// If signal is high...
+	wire ExpOnesA ;											// A's exponent field is all ones
+	wire ExpOnesB ;											// B's exponent field is all ones
+	wire ANaN ;												// A is a NaN (Not-a-Number)
+	wire BNaN ;												// B is a NaN
+	wire AInf ;												// A is infinity
+	wire BInf ;												// B is infinity
+	wire [`EXPONENT-1:0] DAB ;										// ExpA - ExpB
+	wire [`EXPONENT-1:0] DBA ;										// ExpB - ExpA
+	// Fields that are simply passed through
+	assign Opout = operation ;
+	assign Sa = A[`DWIDTH-1] ;
+	assign Sb = B[`DWIDTH-1] ;
+	assign Aout = A[`DWIDTH-2:0] ;
+	assign Bout = B[`DWIDTH-2:0] ;
+	
+	// Exception detection: all-ones exponent, then the mantissa decides NaN vs Infinity
+	assign ExpOnesA = &(A[`DWIDTH-2:`DWIDTH-1-`EXPONENT]) ;
+	assign ExpOnesB = &(B[`DWIDTH-2:`DWIDTH-1-`EXPONENT]) ;
+	assign ANaN = ExpOnesA & |(A[`MANTISSA-1:0]) ;
+	assign BNaN = ExpOnesB & |(B[`MANTISSA-1:0]) ;
+	assign AInf = ExpOnesA & ~|(A[`MANTISSA-1:0]) ;
+	assign BInf = ExpOnesB & ~|(B[`MANTISSA-1:0]) ;
+	assign InputExc = {(ANaN | BNaN | AInf | BInf), ANaN, BNaN, AInf, BInf} ;
+	
+	// Exponent differences, written as add-the-complement-plus-one (two's complement subtraction)
+	assign DAB = (A[`DWIDTH-2:`MANTISSA] + ~(B[`DWIDTH-2:`MANTISSA]) + 1) ;
+	assign DBA = (B[`DWIDTH-2:`MANTISSA] + ~(A[`DWIDTH-2:`MANTISSA]) + 1) ;
+	assign ShiftDet = {DBA[`EXPONENT-1:0], DAB[`EXPONENT-1:0]} ;
+endmodule
+
+
+//
+// Description:	 	The alignment module determines the larger input operand and
+//							sets the mantissas, shift and common exponent accordingly.
+//
+
+
+// FPAddSub_AlignModule: compares the two sign-less operands, then hands out the
+// larger operand's exponent and mantissa, the smaller operand's mantissa, and
+// the matching exponent difference as the alignment shift amount.
+module FPAddSub_AlignModule (
+		A,
+		B,
+		ShiftDet,
+		CExp,
+		MaxAB,
+		Shift,
+		Mmin,
+		Mmax
+	);
+	
+	// Input ports
+	input [`DWIDTH-2:0] A ;								// Input A without its sign bit
+	input [`DWIDTH-2:0] B ;								// Input B without its sign bit
+	input [2*`EXPONENT-1:0] ShiftDet ;						// Two candidate shift amounts, upper half used when B is larger
+	
+	// Output ports
+	output [`EXPONENT-1:0] CExp ;							// Common Exponent
+	output MaxAB ;									// Indicates larger of A and B (0/A, 1/B)
+	output [`EXPONENT-1:0] Shift ;							// Number of steps to smaller mantissa shift right
+	output [`MANTISSA-1:0] Mmin ;							// Smaller mantissa
+	output [`MANTISSA-1:0] Mmax ;							// Larger mantissa
+	
+	// Internal signals
+	wire [`DWIDTH-2:0] Larger ;							// Operand with the larger magnitude (A on a tie)
+	wire [`DWIDTH-2:0] Smaller ;							// The other operand
+	
+	// Magnitude comparison and operand ordering
+	assign MaxAB = (A[`DWIDTH-2:0] < B[`DWIDTH-2:0]) ;
+	assign Larger = MaxAB ? B : A ;
+	assign Smaller = MaxAB ? A : B ;
+	
+	// Determine final shift value
+	assign Shift = MaxAB ? ShiftDet[2*`EXPONENT-1:`EXPONENT] : ShiftDet[`EXPONENT-1:0] ;
+	
+	// Common exponent is the exponent field of the larger operand
+	assign CExp = Larger[`MANTISSA+`EXPONENT-1:`MANTISSA] ;
+	
+	// Mantissa fields of the larger and the smaller operand
+	assign Mmax = Larger[`MANTISSA-1:0] ;
+	assign Mmin = Smaller[`MANTISSA-1:0] ;
+	
+endmodule
+
+
+// Description:	 Alignment shift stage 1, performs 16|12|8|4 shift
+//
+
+
+// ONLY THIS MODULE IS HARDCODED for half precision fp16 and bfloat16
+module FPAddSub_AlignShift1(
+		MminP,
+		Shift,
+		Mmin
+	);
+	// First alignment stage: prepends the hidden one to the smaller mantissa and
+	// shifts it right by a multiple of 4, or clears it when the shift is too large.
+	// Vacated positions are zero-filled: this is a shift, not a rotate.
+	
+	// Input ports
+	input [`MANTISSA-1:0] MminP ;						// Smaller mantissa before shifting
+	input [`EXPONENT-3:0] Shift ;						// Shift amount without its two LSBs (those are applied in the next stage)
+	
+	// Output ports
+	output [`MANTISSA:0] Mmin ;						// The smaller mantissa after the coarse shift
+	
+	// Format select, 1 for bfloat16 and 0 for IEEE half precision
+	wire bf16;
+	assign bf16 = 1'b1; //hardcoding to 1, to avoid ODIN issue. a `ifdef here wasn't working. apparently, nested `ifdefs don't work
+
+	// Internal signals
+	reg	  [`MANTISSA:0]		Lvl1;
+	reg	  [`MANTISSA:0]		Lvl2;
+	wire    [`MANTISSA:0]		ZeroFill;
+	wire    [2*`MANTISSA+1:0]    Stage1;	
+
+	always @(*) begin
+		if (bf16 == 1'b1) begin
+			//For bfloat16 the mantissa is 7 bits wide, so any shift of 8 or more leaves nothing.
+			//Shifts of 8 and above correspond to bits [`EXPONENT-3:1] of the reduced shift amount.
+			Lvl1 = (|Shift[`EXPONENT-3:1]) ? 'd0 : {1'b1, MminP};  // MANTISSA + 1 width
+		end
+		else begin
+			//for half precision fp16 the mantissa is 10 bits wide, so a shift of 16 clears it
+			Lvl1 = Shift[2] ? 'd0 : {1'b1, MminP};
+		end
+	end
+	
+	// Zero-extend to double width so that the slices below read zeros in the
+	// vacated upper positions instead of wrapped-around low mantissa bits.
+	assign ZeroFill = 0;
+	assign Stage1 = {ZeroFill, Lvl1}; //2*MANTISSA + 2 width
+
+	always @(*) begin    					// Shift by {0 | 4} (bf16) or {0 | 4 | 8} (fp16) bits
+		if (bf16 == 1'b1) begin
+			case (Shift[0])
+				// Shift by 0
+				1'b0: Lvl2 = Stage1[`MANTISSA:0];
+				// Shift by 4
+				1'b1: Lvl2 = Stage1[`MANTISSA+4:4];
+				default: Lvl2 = Stage1[`MANTISSA+4:4];
+			endcase
+		end
+		else begin
+			case (Shift[1:0])
+				// Shift by 0
+				2'b00: Lvl2 = Stage1[`MANTISSA:0];
+				// Shift by 4
+				2'b01: Lvl2 = Stage1[`MANTISSA+4:4];
+				// Shift by 8
+				2'b10: Lvl2 = Stage1[`MANTISSA+8:8];
+				// Shift by 12 clears the mantissa
+				2'b11: Lvl2 = 0;
+				default: Lvl2 = 0;
+			endcase
+		end
+	end
+
+	assign Mmin = Lvl2;						// Assign output to next shift stage
+endmodule
+
+
+// Description:	 Alignment shift stage 2, performs 3|2|1 shift
+//
+
+
+module FPAddSub_AlignShift2(
+		MminP,
+		Shift,
+		Mmin
+	);
+	// Second alignment stage: shifts the smaller mantissa right by 0 to 3 bits.
+	// Vacated upper positions are zero-filled: this is a shift, not a rotate.
+	
+	// Input ports
+	input [`MANTISSA:0] MminP ;						// Smaller mantissa after the coarse shift
+	input [1:0] Shift ;						// Shift amount. Last 2 bits
+	
+	// Output ports
+	output [`MANTISSA:0] Mmin ;						// The smaller mantissa
+	
+	// Internal Signal
+	reg	  [`MANTISSA:0]		Lvl3;
+	wire    [`MANTISSA:0]		ZeroFill;
+	wire    [2*`MANTISSA+1:0]    Stage2;	
+	
+	// Zero-extend to double width so the slices below pull zeros into the top bits
+	assign ZeroFill = 0;
+	assign Stage2 = {ZeroFill, MminP};
+
+	always @(*) begin    // Shift right by {0 | 1 | 2 | 3} bits
+	  case (Shift[1:0])
+			2'b00: Lvl3 = Stage2[`MANTISSA:0];			// Shift by 0
+			2'b01: Lvl3 = Stage2[`MANTISSA+1:1];			// Shift by 1
+			2'b10: Lvl3 = Stage2[`MANTISSA+2:2];			// Shift by 2
+			2'b11: Lvl3 = Stage2[`MANTISSA+3:3];			// Shift by 3
+			default: Lvl3 = Stage2[`MANTISSA:0];
+	  endcase
+	end
+	
+	// Assign output
+	assign Mmin = Lvl3;						// Take out smaller mantissa
+endmodule
+
+
+//
+// Description:	 Module that executes the addition or subtraction on mantissas.
+//
+
+
+// FPAddSub_ExecutionModule: adds or subtracts the aligned mantissas (each padded
+// with EXPONENT zero bits at the bottom) and selects the sign of the result.
+module FPAddSub_ExecutionModule(
+		Mmax,
+		Mmin,
+		Sa,
+		Sb,
+		MaxAB,
+		OpMode,
+		Sum,
+		PSgn,
+		Opr
+    );
+
+	// Input ports
+	input [`MANTISSA-1:0] Mmax ;					// The larger mantissa (hidden one is added here)
+	input [`MANTISSA:0] Mmin ;					// The smaller mantissa, already aligned
+	input Sa ;								// Sign bit, selected for PSgn when MaxAB is 0
+	input Sb ;								// Sign bit, selected for PSgn when MaxAB is 1
+	input MaxAB ;							// Indicates the larger number (0/A, 1/B)
+	input OpMode ;							// Operation to be performed (0/Add, 1/Sub)
+	
+	// Output ports
+	output [`DWIDTH:0] Sum ;					// The result of the operation
+	output PSgn ;							// The sign for the result
+	output Opr ;							// The effective (performed) operation, 1 = subtract
+	
+	wire [`EXPONENT-1:0] LowPad ;					// Zero padding appended below both operands
+	wire [`MANTISSA+`EXPONENT:0] OperandMax ;			// {hidden one, larger mantissa, padding}
+	wire [`MANTISSA+`EXPONENT:0] OperandMin ;			// {smaller mantissa, padding}
+
+	assign LowPad = 0 ;
+	assign OperandMax = {1'b1, Mmax, LowPad} ;
+	assign OperandMin = {Mmin, LowPad} ;
+	assign Opr = (OpMode^Sa^Sb) ;				// Requested operation combined with both operand signs
+	assign Sum = Opr ? (OperandMax - OperandMin) : (OperandMax + OperandMin) ;
+	assign PSgn = (MaxAB ? Sb : Sa) ;			// Result takes the sign of the larger operand
+endmodule
+
+
+//
+// Description:	 Determine the normalization shift amount and perform 16-shift
+//
+
+
+// FPAddSub_NormalizeModule: finds the highest set bit of the sum (scanning bits
+// 16 down to 4) and reports how far the sum must be shifted left to normalize it.
+// The bit positions are written for a 17-bit Sum.
+module FPAddSub_NormalizeModule(
+		Sum,
+		Mmin,
+		Shift
+    );
+
+	// Input ports
+	input [`DWIDTH:0] Sum ;					// Mantissa sum including hidden 1 and GRS
+	
+	// Output ports
+	output [`DWIDTH:0] Mmin ;					// Mantissa after 16|0 shift
+	output [4:0] Shift ;					// Shift amount
+	
+	// Internal signals
+	reg	  [4:0]			LeadPos;		// Distance of the first set bit from bit 16
+	reg	  [`DWIDTH:0]		Lvl1;
+	
+	// Priority scan from the top bit downwards; the first set bit decides the
+	// shift amount. If none of bits 16..4 is set the amount saturates at 13.
+	always @(*) begin
+		if (Sum[16]) LeadPos = 5'b00000;
+		else if (Sum[15]) LeadPos = 5'b00001;
+		else if (Sum[14]) LeadPos = 5'b00010;
+		else if (Sum[13]) LeadPos = 5'b00011;
+		else if (Sum[12]) LeadPos = 5'b00100;
+		else if (Sum[11]) LeadPos = 5'b00101;
+		else if (Sum[10]) LeadPos = 5'b00110;
+		else if (Sum[9]) LeadPos = 5'b00111;
+		else if (Sum[8]) LeadPos = 5'b01000;
+		else if (Sum[7]) LeadPos = 5'b01001;
+		else if (Sum[6]) LeadPos = 5'b01010;
+		else if (Sum[5]) LeadPos = 5'b01011;
+		else if (Sum[4]) LeadPos = 5'b01100;
+		else LeadPos = 5'b01101;
+	end
+	
+	assign Shift = LeadPos;
+	
+	always @(*) begin
+		// Coarse step, taken only when bit 4 of the shift amount is set.
+		// The scan above never produces a value with that bit set.
+		if (Shift[4]) Lvl1 = {Sum[8:0], 8'b00000000};
+		else Lvl1 = Sum;
+	end
+	
+	// Assign outputs
+	assign Mmin = Lvl1;						// Sum after the optional coarse step
+	
+endmodule
+
+
+// Description:	 Normalization shift stage 1, performs 12|8|4|3|2|1|0 shift
+//
+//Hardcoding loop start and end values of i. To avoid ODIN limitations. i=`DWIDTH*2+1 wasn't working.
+
+module FPAddSub_NormalizeShift1(
+		MminP,
+		Shift,
+		Mmin
+	);
+	// Two cascaded 4-way multiplexers selected by Shift[3:2] and Shift[1:0].
+	// Input ports
+	input [`DWIDTH:0] MminP ;						// Smaller mantissa after 16|12|8|4 shift
+	input [3:0] Shift ;						// Shift amount
+	
+	// Output ports
+	output [`DWIDTH:0] Mmin ;						// The smaller mantissa
+	
+	reg	  [`DWIDTH:0]		Lvl2;
+	wire    [2*`DWIDTH+1:0]    Stage1;	
+	reg	  [`DWIDTH:0]		Lvl3;
+	wire    [2*`DWIDTH+1:0]    Stage2;	
+	// Each stage concatenates its input with itself and picks a window of it.
+	// NOTE(review): the window bounds below are literals, not `DWIDTH-based - confirm they match `DWIDTH.
+	assign Stage1 = {MminP, MminP};
+	// Combinational logic: blocking assignments are used in both always blocks.
+	always @(*) begin    					// Rotate {0 | 4 | 8 | 12} bits
+	  case (Shift[3:2])
+			// Rotate by 0
+			2'b00: Lvl2 = Stage1[`DWIDTH:0];       		
+			// Rotate by 4
+			2'b01: Lvl2 = Stage1[28:13];
+			// Rotate by 8
+			2'b10: Lvl2 = Stage1[24:9];
+			// Rotate by 12
+			2'b11: Lvl2 = Stage1[20:5];
+			default: Lvl2 = Stage1[`DWIDTH:0];
+	  endcase
+	end
+	
+	assign Stage2 = {Lvl2, Lvl2};
+
+	always @(*) begin   				 		// Rotate {0 | 1 | 2 | 3} bits
+	  case (Shift[1:0])
+			// Rotate by 0
+			2'b00: Lvl3 = Stage2[`DWIDTH:0];
+			// Rotate by 1
+			2'b01: Lvl3 = Stage2[31:16];
+			// Rotate by 2
+			2'b10: Lvl3 = Stage2[30:15];
+			// Rotate by 3
+			2'b11: Lvl3 = Stage2[29:14];
+			default: Lvl3 = Stage2[`DWIDTH:0];
+	  endcase
+	end
+	
+	// Assign outputs
+	assign Mmin = Lvl3;						// Take out smaller mantissa			
+	
+endmodule
+
+
+// Description:	 Normalization shift stage 2, calculates post-normalization
+//						 mantissa and exponent, as well as the bits used in rounding		
+//
+
+
+module FPAddSub_NormalizeShift2(
+		PSSum,
+		CExp,
+		Shift,
+		NormM,
+		NormE,
+		ZeroSum,
+		NegE,
+		R,
+		S,
+		FG
+	);
+
+	// Slices the pre-shift sum into mantissa and rounding bits and derives
+	// the exponent corrected by the normalization shift amount.
+
+	// Inputs
+	input [`DWIDTH:0] PSSum ;					// The Pre-Shift-Sum
+	input [`EXPONENT-1:0] CExp ;				// Exponent before correction
+	input [4:0] Shift ;						// Amount subtracted from CExp
+
+	// Outputs
+	output [`MANTISSA-1:0] NormM ;			// Mantissa field taken from PSSum
+	output [`EXPONENT:0] NormE ;				// Corrected exponent, one bit wider than CExp
+	output ZeroSum ;							// High when every bit of PSSum is 0
+	output NegE ;								// Top bit of (CExp - Shift)
+	output R ;									// Round bit
+	output S ;									// Sticky bit: OR of all bits below R
+	output FG ;									// Bit directly above R
+
+	// The two exponent candidates
+	wire [`EXPONENT:0] ExpNoCarry ;			// Selected when PSSum's top bit is clear
+	wire [`EXPONENT:0] ExpCarry ;				// Selected when PSSum's top bit is set
+
+	assign ExpNoCarry = CExp - Shift ;
+	assign ExpCarry = CExp - Shift + 1'b1 ;
+	assign NormE = PSSum[`DWIDTH] ? ExpCarry : ExpNoCarry ;
+	assign NegE = ExpNoCarry[`EXPONENT] ;
+
+	// Zero detect, mantissa field and the three rounding bits
+	assign ZeroSum = ~(|PSSum) ;
+	assign NormM = PSSum[`DWIDTH-1:`EXPONENT+1] ;
+	assign FG = PSSum[`EXPONENT] ;
+	assign R = PSSum[`EXPONENT-1] ;
+	assign S = |PSSum[`EXPONENT-2:0] ;
+
+endmodule
+
+
+// Description:	 Performs 'Round to nearest, tie to even'-rounding on the
+//						 normalized mantissa according to the G, R, S bits. Calculates
+//						 final result and checks for exponent overflow.
+//
+
+
+module FPAddSub_RoundModule(
+		ZeroSum,
+		NormE,
+		NormM,
+		R,
+		S,
+		G,
+		Sa,
+		Sb,
+		Ctrl,
+		MaxAB,
+		Z,
+		EOF
+    );
+
+	// Rounds the normalized mantissa using G, R, S and the mantissa LSB,
+	// bumps the exponent on a rounding carry, and packs sign, exponent and
+	// mantissa into Z. EOF reports the extra (top) exponent bit.
+	// Input ports
+	input ZeroSum ;								// Sum is zero
+	input [`EXPONENT:0] NormE ;					// Normalized exponent
+	input [`MANTISSA-1:0] NormM ;				// Normalized mantissa
+	input R ;									// Round bit
+	input S ;									// Sticky bit
+	input G ;									// Gates the round-up decision
+	input Sa ;									// A's sign bit
+	input Sb ;									// B's sign bit
+	input Ctrl ;								// Control bit (operation)
+	input MaxAB ;								// Selects which sign term applies for a non-zero sum
+
+	// Output ports
+	output [`DWIDTH-1:0] Z ;					// Final result
+	output EOF ;								// Top bit of the rounded exponent
+
+	// ---- Mantissa ----
+	wire IncrementM ;							// Round up (add one) is required
+	wire [`MANTISSA:0] MantPlusOne ;			// Mantissa + 1 with room for the carry
+	wire MantCarry ;							// Rounding up overflowed the mantissa
+	wire [`MANTISSA-1:0] MantOut ;				// Mantissa placed in Z
+
+	assign IncrementM = G & (R | S | NormM[0]) ;
+	assign MantPlusOne = (NormM + 1) ;
+	assign MantCarry = IncrementM & MantPlusOne[`MANTISSA] ;
+	assign MantOut = IncrementM ? MantPlusOne[`MANTISSA-1:0] : NormM ;
+
+	// ---- Exponent: forced to zero for a zero sum, else NormE plus carry ----
+	wire [`EXPONENT:0] ExpZero ;
+	wire [`EXPONENT:0] ExpOut ;					// Extra bit holds a potential overflow
+
+	assign ExpZero = 0 ;
+	assign ExpOut = ZeroSum ? ExpZero : (NormE + MantCarry) ;
+
+	// ---- Sign: separate terms for a zero and a non-zero sum ----
+	wire SgnIfZero ;
+	wire SgnIfNonZero ;
+	wire SgnOut ;
+
+	assign SgnIfZero = (Sa ^ Sb) | (Sa & Sb & ~Ctrl) ;
+	assign SgnIfNonZero = (~MaxAB & Sa) | ((Ctrl ^ Sb) & (MaxAB | Sa)) ;
+	assign SgnOut = ZeroSum ? SgnIfZero : SgnIfNonZero ;
+
+	assign Z = {SgnOut, ExpOut[`EXPONENT-1:0], MantOut} ;
+	assign EOF = ExpOut[`EXPONENT] ;
+endmodule
+
+
+//
+// Description:	 Check the final result for exception conditions and set
+//						 flags accordingly.
+//
+
+
+module FPAddSub_ExceptionModule(
+		Z,
+		NegE,
+		R,
+		S,
+		InputExc,
+		EOF,
+		P,
+		Flags
+    );
+
+	// Forwards the rounded result unchanged and derives the five exception
+	// flags, packed as
+	// Flags = {Overflow, Underflow, DivideByZero, Invalid, Inexact}.
+
+	// Input ports
+	input [`DWIDTH-1:0] Z	;					// Final product
+	input NegE ;								// Negative exponent?
+	input R ;									// Round bit
+	input S ;									// Sticky bit
+	input [4:0] InputExc ;						// Exceptions in inputs A and B
+	input EOF ;									// Exponent overflow from the rounding stage
+
+	// Output ports
+	output [`DWIDTH-1:0] P ;					// Final result
+	output [4:0] Flags ;						// Exception flags
+
+	// Internal terms
+	wire [`EXPONENT-1:0] ExpField ;			// Exponent field of Z
+	wire AnyLowExc ;							// InputExc[1] or InputExc[0] is set
+	wire Lost ;									// Round or sticky bit is set
+	wire Ovf ;									// Result is too big to be represented
+	wire Unf ;									// Result is too small to be represented
+
+	assign ExpField = Z[`MANTISSA+`EXPONENT-1:`MANTISSA] ;
+	assign AnyLowExc = InputExc[1] | InputExc[0] ;
+	assign Lost = R | S ;
+	assign Ovf = EOF | AnyLowExc ;
+	assign Unf = NegE & Lost ;
+
+	// Flag bits, most significant first
+	assign Flags[4] = Ovf ;
+	assign Flags[3] = Unf ;
+	// Divide-by-zero asks for an exponent field that is all ones and all
+	// zeros at once, so it stays 0 (always 0 in Add/Sub).
+	assign Flags[2] = (&ExpField) & ~(|ExpField) & ~AnyLowExc ;
+	assign Flags[1] = |InputExc[4:2] ;				// Invalid inputs or operation
+	assign Flags[0] = Lost | Ovf | Unf ;			// Inexact: rounding, overflow or underflow
+
+	// The result itself is passed through as is
+	assign P = Z ;
+
+
+endmodule
+
+`endif
+
+
diff --git a/third_party/vtr_flow/verilog/hard_block_include.v b/third_party/vtr_flow/verilog/hard_block_include.v
new file mode 100644
index 000000000..cc4d502c5
--- /dev/null
+++ b/third_party/vtr_flow/verilog/hard_block_include.v
@@ -0,0 +1,3 @@
+`define complex_dsp
+`define hard_mem
+// NOTE(review): both macros are defined without a value; presumably they are tested by `ifdef guards in the benchmark sources that include this file - confirm against the includers.
diff --git a/third_party/vtr_flow/verilog/raygentop.v b/third_party/vtr_flow/verilog/raygentop.v
new file mode 100644
index 000000000..256b3aead
--- /dev/null
+++ b/third_party/vtr_flow/verilog/raygentop.v
@@ -0,0 +1,2978 @@
+ module paj_raygentop_hierarchy_no_mem (rgwant_addr, rgwant_data, rgread_ready, rgaddr_ready, rgdata_ready, rgwant_read, rgdatain, rgdataout, rgaddrin, rgCont, rgStat, rgCfgData, rgwant_CfgData, rgCfgData_ready, tm3_sram_data_in, tm3_sram_data_out, tm3_sram_addr, tm3_sram_we, tm3_sram_oe, tm3_sram_adsp, clk, fbdata, fbdatavalid, fbnextscanline, raygroup01, raygroupvalid01, busy01, raygroup10, raygroupvalid10, busy10, globalreset, rgData, rgAddr, rgWE, rgAddrValid, rgDone, rgResultData, rgResultReady, rgResultSource);
+    // Top level: instantiates and wires the sub-blocks below; the only local logic is the page register and three continuous assigns.
+    output rgwant_addr; 
+    wire rgwant_addr;
+    output rgwant_data; 
+    wire rgwant_data;
+    output rgread_ready; 
+    wire rgread_ready;
+    input rgaddr_ready; 
+    input rgdata_ready; 
+
+    input rgwant_read; 
+    input[63:0] rgdatain; 
+    output[63:0] rgdataout; 
+    wire[63:0] rgdataout;
+    input[17:0] rgaddrin; 
+    input[31:0] rgCont; 
+    output[31:0] rgStat; 
+    wire[31:0] rgStat;
+    input[31:0] rgCfgData; 
+    output rgwant_CfgData; 
+    wire rgwant_CfgData;
+    input rgCfgData_ready; 
+
+    input[63:0] tm3_sram_data_in; 
+    wire[63:0] tm3_sram_data_in;
+    output[63:0] tm3_sram_data_out; 
+    wire[63:0] tm3_sram_data_out;
+    wire[63:0] tm3_sram_data_xhdl0; // NOTE(review): not referenced anywhere else in this module
+    output[18:0] tm3_sram_addr; 
+    wire[18:0] tm3_sram_addr;
+    output[7:0] tm3_sram_we; 
+    wire[7:0] tm3_sram_we;
+    output[1:0] tm3_sram_oe; 
+    wire[1:0] tm3_sram_oe;
+    output tm3_sram_adsp; 
+    wire tm3_sram_adsp;
+    input clk; 
+
+    output[63:0] fbdata; 
+    wire[63:0] fbdata;
+    output fbdatavalid; 
+    wire fbdatavalid;
+    input fbnextscanline; 
+    output[1:0] raygroup01; 
+    wire[1:0] raygroup01;
+    output raygroupvalid01; 
+    wire raygroupvalid01;
+    input busy01; 
+    output[1:0] raygroup10; 
+    wire[1:0] raygroup10;
+
+    output raygroupvalid10; 
+    wire raygroupvalid10;
+    input busy10; 
+    input globalreset; 
+    output[31:0] rgData; 
+    wire[31:0] rgData;
+    output[3:0] rgAddr; 
+    wire[3:0] rgAddr;
+    output[2:0] rgWE; 
+    wire[2:0] rgWE;
+    output rgAddrValid; 
+    wire rgAddrValid;
+
+    input rgDone; 
+    input[31:0] rgResultData; 
+    input rgResultReady; 
+    input[1:0] rgResultSource; 
+    // Internal nets connecting the sub-block instances below
+    wire[2:0] statepeek2; 
+    wire as01; 
+    wire ack01; 
+
+    wire[3:0] addr01; 
+    wire[47:0] dir01; 
+    wire[47:0] dir; 
+    wire[47:0] sramdatal; 
+    wire wantDir; 
+    wire dirReady; 
+    wire dirReadyl; 
+    wire[14:0] address; 
+    wire[30:0] cyclecounter; 
+
+    wire nas01; 
+    wire nas10; 
+    wire go; 
+    reg page; 
+    wire[2:0] statepeekct; 
+    // result Signals
+    wire valid01; 
+    wire valid10; 
+    wire[15:0] id01a; 
+    wire[15:0] id01b; 
+    wire[15:0] id01c; 
+    wire[15:0] id10a; 
+
+    wire[15:0] id10b; 
+    wire[15:0] id10c; 
+    wire hit01a; 
+    wire hit01b; 
+    wire hit01c; 
+    wire hit10a; 
+    wire hit10b; 
+    wire hit10c; 
+    wire[7:0] u01a; 
+    wire[7:0] u01b; 
+    wire[7:0] u01c; 
+    wire[7:0] v01a; 
+
+    wire[7:0] v01b; 
+    wire[7:0] v01c; 
+    wire[7:0] u10a; 
+    wire[7:0] u10b; 
+    wire[7:0] u10c; 
+    wire[7:0] v10a; 
+    wire[7:0] v10b; 
+    wire[7:0] v10c; 
+    wire wantwriteback; 
+    wire writebackack; 
+    wire[63:0] writebackdata; 
+    wire[17:0] writebackaddr; 
+
+    wire[17:0] nextaddr01; 
+    // Shading Signals
+    wire[63:0] shadedata; 
+    wire[15:0] triID; 
+    wire wantshadedata; 
+    wire shadedataready; 
+    // CfgData Signals
+    wire[27:0] origx; 
+    wire[27:0] origy; 
+    wire[27:0] origz; 
+    wire[15:0] m11; 
+    wire[15:0] m12; 
+
+    wire[15:0] m13; 
+    wire[15:0] m21; 
+    wire[15:0] m22; 
+    wire[15:0] m23; 
+    wire[15:0] m31; 
+    wire[15:0] m32; 
+    wire[15:0] m33; 
+    wire[20:0] bkcolour; 
+    // Texture signals
+    wire[20:0] texinfo; 
+    wire[3:0] texaddr; 
+    wire[63:0] texel; 
+
+    wire[17:0] texeladdr; 
+    wire wanttexel; 
+    wire texelready; 
+    // Frame Buffer Read Signals
+    wire fbpage; 
+    // debug signals
+    wire wantcfg; 
+    wire debugglobalreset; 
+
+    assign rgwant_CfgData = wantcfg ;
+    // go is the onlyonecycle output for trigger input rgCont[0]
+    onlyonecycle onlyeonecycleinst (rgCont[0], go, globalreset, clk); 
+    // page inverts on every rising clock edge while globalreset is low; fbpage is its complement
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          page <= 1'b1 ; // Reset to 1 such that first flip sets to 0
+       end
+       else
+
+       begin
+          page <= ~page ; 
+       end 
+    end 
+    assign fbpage = ~page ;
+    // matmult: vector inputs are the three 16-bit fields of sramdatal, coefficients are m11..m33, results are the three 16-bit fields of dir
+    matmult matmultinst(sramdatal[47:32], sramdatal[31:16], sramdatal[15:0], m11, m12, m13, m21, m22, m23, m31, m32, m33, dir[47:32], dir[31:16], dir[15:0], clk); 
+    // dirReadyl is dirReady passed through the three-register delay1x3
+    delay1x3 dir01delay(dirReady, dirReadyl, clk); 
+    rgconfigmemory ConfigMemoryInst (rgCfgData[31:28], rgCfgData[27:0], rgCfgData_ready, wantcfg, origx, origy, origz, m11, m12, m13, m21, m22, m23, m31, m32, m33, bkcolour, texinfo, globalreset, clk); 
+
+    rgsramcontroller sramcont (rgwant_addr, rgaddr_ready, rgaddrin, rgwant_data, rgdata_ready, rgdatain, rgwant_read, rgread_ready, rgdataout, dirReady, wantDir, sramdatal, address, wantwriteback, writebackack, writebackdata, writebackaddr, fbdata, fbnextscanline, fbdatavalid, fbpage, shadedata, triID, wantshadedata, shadedataready, texeladdr, texel, wanttexel, texelready, tm3_sram_data_in, tm3_sram_data_out, tm3_sram_addr, tm3_sram_we, tm3_sram_oe, tm3_sram_adsp, globalreset, clk);
+    raysend raysendinst (as01, ack01, addr01, dir01, origx, origy, origz, rgData, rgAddr, rgWE, rgAddrValid, rgDone, globalreset, clk, statepeek2); 
+    // NOTE(review): resultwriter below is connected to debugglobalreset (globalreset | go) where the other instances are connected to globalreset - presumably its reset input; confirm
+    raygencont  raygencontinst(go, rgCont[15:1], rgStat[31], cyclecounter, nextaddr01, nas01, nas10, page, dirReadyl, wantDir, dir, address, as01, addr01, ack01, dir01, raygroup01, raygroupvalid01, busy01, raygroup10, raygroupvalid10, busy10, globalreset, clk, statepeekct); 
+    resultrecieve resultrecieveinst (valid01, valid10, id01a, id01b, id01c, id10a, id10b, id10c, hit01a, hit01b, hit01c, hit10a, hit10b, hit10c, u01a, u01b, u01c, v01a, v01b, v01c, u10a, u10b, u10c, v10a, v10b, v10c, rgResultData, rgResultReady, rgResultSource, globalreset, clk); 
+    assign debugglobalreset = globalreset | go ;
+    resultwriter resultwriteinst (valid01, valid10, id01a, id01b, id01c, id10a, id10b, id10c, hit01a, hit01b, hit01c, hit10a, hit10b, hit10c, u01a, u01b, u01c, v01a, v01b, v01c, u10a, u10b, u10c, v10a, v10b, v10c, nextaddr01, nas01, nas10, bkcolour, shadedata, triID, wantshadedata, shadedataready, texinfo, texaddr, texeladdr, texel, wanttexel, texelready, writebackdata, writebackaddr, wantwriteback, writebackack, debugglobalreset, clk);
+    assign rgStat[30:0] = cyclecounter ;
+ endmodule
+
+
+module delay1x3 (datain, dataout, clk);
+
+    // Three-stage single-bit delay line: dataout reproduces datain three
+    // rising clock edges later.
+    input datain; 
+    output dataout; 
+    wire dataout;
+    input clk; 
+
+    // pipe[0] is the newest sample, pipe[2] the oldest.
+    reg[2:0] pipe; 
+
+    // The oldest sample drives the output.
+    assign dataout = pipe[2] ;
+
+    always @(posedge clk)
+    begin
+       // Shift one position towards the output and take in a new sample.
+       pipe <= {pipe[1:0], datain} ; 
+    end 
+ endmodule
+
+
+    
+
+    
+    
+ // A debugging circuit that allows a single-cycle pulse to be
+ // generated through the ports package
+ module onlyonecycle (trigger, output_xhdl0, globalreset, clk);
+
+    // Three-state FSM that produces a pulse on output_xhdl0 in response to
+    // a level on trigger:
+    //   state 0 - idle, output low; goes to state 1 when trigger is high
+    //   state 1 - output high; goes to state 2 once count is 0
+    //   state 2 - output low; goes back to state 0 once trigger is low
+    // State 0 always loads count with 0, so state 1 lasts a single clock.
+    // The state encoding 3 is never entered from reset; if it is reached
+    // anyway the machine returns to state 0 on the next clock.
+    input trigger; 
+    output output_xhdl0; 
+    reg output_xhdl0;
+    input globalreset; 
+    input clk; 
+
+    reg[1:0] state; 
+    reg[1:0] next_state; 
+    reg count; 
+    reg temp_count; 
+
+    // State and counter registers; globalreset is synchronous, active high.
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          state <= 0 ; 
+          count <= 0 ; 
+       end
+       else
+       begin
+          state <= next_state ; 
+          count <= temp_count ; 
+       end 
+    end 
+
+    // Next-state and output logic (combinational).
+    always @(state or trigger or count)
+    begin
+       // Defaults: every signal driven by this block gets a value on every
+       // path, so no latch is inferred. The counter holds its value and an
+       // unlisted state falls back to idle with the output low.
+       output_xhdl0 = 1'b0 ; 
+       next_state = 0 ; 
+       temp_count = count ; 
+
+       case (state)
+          0 :
+                   begin
+                      // Idle: arm on trigger and clear the counter.
+                      if (trigger == 1'b1)
+                         next_state = 1 ; 
+                      temp_count = 1'b0 ; 
+                   end
+          1 :
+                   begin
+                      // Pulse: drive the output and leave once count is 0.
+                      output_xhdl0 = 1'b1 ; 
+                      if (count == 0)
+                         next_state = 2 ; 
+                      else
+                         next_state = 1 ; 
+                      temp_count = count - 1 ; 
+                   end
+          2 :
+                   begin
+                      // Done: wait for trigger to be released before re-arming.
+                      if (trigger == 1'b0)
+                         next_state = 0 ; 
+                      else
+                         next_state = 2 ; 
+                   end
+       endcase 
+    end 
+ endmodule
+
+module matmult (Ax, Ay, Az, m11, m12, m13, m21, m22, m23, m31, m32, m33, Cx, Cy, Cz, clk);
+
+    // Two-stage pipelined 3x3 matrix * vector product on 16-bit operands.
+    // Stage 1 registers the nine 32-bit products; stage 2 registers, for
+    // each row, the sum of bits [30:15] of that row's three products.
+    // The sums are kept to 16 bits, so any carry out is discarded.
+    // A result therefore appears two rising clock edges after its inputs.
+
+    // Input vector
+    input[15:0] Ax; 
+    input[15:0] Ay; 
+    input[15:0] Az; 
+    // Matrix coefficients: mRC multiplies vector element C in result row R
+    input[15:0] m11; 
+    input[15:0] m12; 
+    input[15:0] m13; 
+    input[15:0] m21; 
+    input[15:0] m22; 
+    input[15:0] m23; 
+    input[15:0] m31; 
+    input[15:0] m32; 
+    input[15:0] m33; 
+    // Result vector
+    output[15:0] Cx; 
+    output[15:0] Cy; 
+    output[15:0] Cz; 
+    reg[15:0] Cx;
+    reg[15:0] Cy;
+    reg[15:0] Cz;
+    input clk; 
+
+    // Stage-1 product registers, named prod_<row><column>
+    reg[31:0] prod_11, prod_12, prod_13; 
+    reg[31:0] prod_21, prod_22, prod_23; 
+    reg[31:0] prod_31, prod_32, prod_33; 
+
+    always @(posedge clk)
+    begin
+       // Stage 1: element-wise products
+       prod_11 <= Ax * m11 ; 
+       prod_12 <= Ay * m12 ; 
+       prod_13 <= Az * m13 ; 
+       prod_21 <= Ax * m21 ; 
+       prod_22 <= Ay * m22 ; 
+       prod_23 <= Az * m23 ; 
+       prod_31 <= Ax * m31 ; 
+       prod_32 <= Ay * m32 ; 
+       prod_33 <= Az * m33 ; 
+
+       // Stage 2: per-row sums of the products registered one cycle earlier
+       Cx <= prod_11[30:15] + prod_12[30:15] + prod_13[30:15] ; 
+       Cy <= prod_21[30:15] + prod_22[30:15] + prod_23[30:15] ; 
+       Cz <= prod_31[30:15] + prod_32[30:15] + prod_33[30:15] ; 
+    end 
+ endmodule
+
+    
+    
+
+module rgconfigmemory (CfgAddr, CfgData, CfgData_Ready, want_CfgData, origx, origy, origz, m11, m12, m13, m21, m22, m23, m31, m32, m33, bkcolour, texinfo, globalreset, clk);
+
+    // Configuration register file for the ray generator.
+    //
+    // A two-state handshake accepts one configuration word at a time:
+    //   state 0 - want_CfgData is high; when CfgData_Ready is high the word
+    //             on CfgData is stored in the register selected by CfgAddr
+    //   state 1 - want_CfgData is low; waits for CfgData_Ready to go low
+    //
+    // CfgAddr map (see the decode below):
+    //   0001..0011  origx, origy, origz                         (28 bits)
+    //   0100..1100  m11, m12, m13, m21, m22, m23, m31, m32, m33 (16 bits)
+    //   1101        bkcolour                                    (21 bits)
+    //   1110        write enable of the spram21x4 instance (texinfo)
+    //
+    // Design notes:
+    //   * Every register, bkcolour included, is loaded from its temp_*
+    //     next-value signal.
+    //   * Each temp_* defaults to the current register value, so the
+    //     combinational block infers no latches and the reset values are
+    //     kept until the corresponding address is written.
+    //   * The sensitivity list names every signal the block reads.
+
+    input[3:0] CfgAddr; 
+    input[27:0] CfgData; 
+    input CfgData_Ready; 
+    output want_CfgData; 
+    reg want_CfgData;
+    output[27:0] origx; 
+    reg[27:0] origx;
+    output[27:0] origy; 
+    reg[27:0] origy;
+    output[27:0] origz; 
+    reg[27:0] origz;
+    output[15:0] m11; 
+    reg[15:0] m11;
+    output[15:0] m12; 
+    reg[15:0] m12;
+    output[15:0] m13; 
+    reg[15:0] m13;
+    output[15:0] m21; 
+    reg[15:0] m21;
+    output[15:0] m22; 
+    reg[15:0] m22;
+    output[15:0] m23; 
+    reg[15:0] m23;
+    output[15:0] m31; 
+    reg[15:0] m31;
+    output[15:0] m32; 
+    reg[15:0] m32;
+    output[15:0] m33; 
+    reg[15:0] m33;
+    output[20:0] bkcolour; 
+    reg[20:0] bkcolour;
+    output[20:0] texinfo; 
+    wire[20:0] texinfo;
+    input globalreset; 
+    input clk; 
+
+    reg state; 
+    reg next_state; 
+    wire we; 
+
+    // Next values of the configuration registers
+    reg[27:0] temp_origx;
+    reg[27:0] temp_origy;
+    reg[27:0] temp_origz;
+    reg[15:0] temp_m11;
+    reg[15:0] temp_m12;
+    reg[15:0] temp_m13;
+    reg[15:0] temp_m21;
+    reg[15:0] temp_m22;
+    reg[15:0] temp_m23;
+    reg[15:0] temp_m31;
+    reg[15:0] temp_m32;
+    reg[15:0] temp_m33;
+    reg[20:0] temp_bkcolour;
+
+    // <<X-HDL>> Can't find translated component 'spram'. Module name may not match
+    // texinfo storage: we is raised while address 1110 is presented with CfgData_Ready
+    spram21x4 spraminst(we, texinfo, CfgData[20:0], clk); 
+    assign we = ((CfgData_Ready == 1'b1) & (CfgAddr == 4'b1110)) ? 1'b1 : 1'b0 ;
+
+    // Registers: synchronous, active-high reset (identity matrix, zero origin)
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          state <= 0 ; 
+          origx <= 0;
+          origy <= 0;
+          origz <= 0;
+          m11 <= 1;
+          m12 <= 0;
+          m13 <= 0;
+          m21 <= 0;
+          m22 <= 1;
+          m23 <= 0;
+          m31 <= 0;
+          m32 <= 0;
+          m33 <= 1;
+          bkcolour <= 0;
+       end
+       else
+       begin
+          state <= next_state ; 
+          origx <= temp_origx;
+          origy <= temp_origy;
+          origz <= temp_origz;
+          m11 <= temp_m11;
+          m12 <= temp_m12;
+          m13 <= temp_m13;
+          m21 <= temp_m21;
+          m22 <= temp_m22;
+          m23 <= temp_m23;
+          m31 <= temp_m31;
+          m32 <= temp_m32;
+          m33 <= temp_m33;
+          bkcolour <= temp_bkcolour;
+       end 
+    end 
+
+    // Handshake and write decode (combinational)
+    always @(state or CfgData_Ready or CfgAddr or CfgData or
+             origx or origy or origz or
+             m11 or m12 or m13 or m21 or m22 or m23 or m31 or m32 or m33 or
+             bkcolour)
+    begin
+       // Default: every register keeps its current value
+       temp_origx = origx ; 
+       temp_origy = origy ; 
+       temp_origz = origz ; 
+       temp_m11 = m11 ; 
+       temp_m12 = m12 ; 
+       temp_m13 = m13 ; 
+       temp_m21 = m21 ; 
+       temp_m22 = m22 ; 
+       temp_m23 = m23 ; 
+       temp_m31 = m31 ; 
+       temp_m32 = m32 ; 
+       temp_m33 = m33 ; 
+       temp_bkcolour = bkcolour ; 
+
+       case (state)
+          0 :
+                   begin
+                      want_CfgData = 1'b1 ; 
+                      if (CfgData_Ready == 1'b1)
+                      begin
+                         next_state = 1 ; 
+                         case (CfgAddr)
+                            4'b0001 : temp_origx = CfgData ; 
+                            4'b0010 : temp_origy = CfgData ; 
+                            4'b0011 : temp_origz = CfgData ; 
+                            4'b0100 : temp_m11 = CfgData[15:0] ; 
+                            4'b0101 : temp_m12 = CfgData[15:0] ; 
+                            4'b0110 : temp_m13 = CfgData[15:0] ; 
+                            4'b0111 : temp_m21 = CfgData[15:0] ; 
+                            4'b1000 : temp_m22 = CfgData[15:0] ; 
+                            4'b1001 : temp_m23 = CfgData[15:0] ; 
+                            4'b1010 : temp_m31 = CfgData[15:0] ; 
+                            4'b1011 : temp_m32 = CfgData[15:0] ; 
+                            4'b1100 : temp_m33 = CfgData[15:0] ; 
+                            4'b1101 : temp_bkcolour = CfgData[20:0] ; 
+                         endcase 
+                      end
+                      else
+                      begin
+                         next_state = 0 ; 
+                      end 
+                   end
+          1 :
+                   begin
+                      want_CfgData = 1'b0 ; 
+                      if (CfgData_Ready == 1'b0)
+                      begin
+                         next_state = 0 ; 
+                      end
+                      else
+                      begin
+                         next_state = 1 ; 
+                      end 
+                   end
+       endcase 
+    end 
+ endmodule
+
+    
+    
+ module spram21x4 (we, dataout, datain, clk);
+
+    // Wrapper around a single_port_ram instance configured for 21-bit
+    // data and an 8-bit address. The address is not supplied by the
+    // caller: it is an 8-bit history of the write enable, shifted by one
+    // position on every rising clock edge.
+    // NOTE(review): single_port_ram is defined outside this module; its
+    // read/write timing is not visible here - confirm before relying on it.
+
+    input we; 
+    input[20:0] datain; 
+    input clk; 
+    output[20:0] dataout; 
+    wire[20:0] dataout;
+
+    // addr_hist[0] holds the most recent we sample, addr_hist[7] the oldest.
+    reg[7:0] addr_hist;
+
+    always @(posedge clk)
+    begin
+       addr_hist <= {addr_hist[6:0], we} ; 
+    end 
+
+    // RAM geometry, set with defparam (odin 2 ram specifications)
+    defparam new_ram.ADDR_WIDTH = 8;
+    defparam new_ram.DATA_WIDTH = 21;
+
+    single_port_ram new_ram (
+       .clk  (clk),
+       .we   (we),
+       .addr (addr_hist),
+       .data (datain),
+       .out  (dataout)
+    );
+ endmodule
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+
+module rgsramcontroller (want_addr, addr_ready, addrin, want_data, data_ready, datain, want_read, read_ready, dataout, dirReady, wantDir, sramdatal, addr, wantwriteback, writebackack, writebackdata, writebackaddr, fbdata, fbnextscanline, fbdatavalid, fbpage, shadedata, triID, wantshadedata, shadedataready, texeladdr, texel, wanttexel, texelready, tm3_sram_data_in, tm3_sram_data_out, tm3_sram_addr, tm3_sram_we, tm3_sram_oe, tm3_sram_adsp, globalreset, clk);
+
+    output want_addr; 
+    reg want_addr;
+    input addr_ready; 
+    input[17:0] addrin; 
+    output want_data; 
+    reg want_data;
+    input data_ready; 
+    input[63:0] datain; 
+    input want_read; 
+    output read_ready; 
+
+    reg read_ready;
+    output[63:0] dataout; 
+    wire[63:0] dataout;
+    output dirReady; 
+    reg dirReady;
+    input wantDir; 
+    output[47:0] sramdatal; 
+    reg[47:0] sramdatal;
+    output[14:0] addr; 
+    wire[14:0] addr;
+    input wantwriteback; 
+    output writebackack; 
+
+    reg writebackack;
+    input[63:0] writebackdata; 
+    input[17:0] writebackaddr; 
+    output[63:0] fbdata; 
+    reg[63:0] fbdata;
+    input fbnextscanline; 
+    output fbdatavalid; 
+    reg fbdatavalid;
+    input fbpage; 
+    output[63:0] shadedata; 
+    wire[63:0] shadedata;
+    input[15:0] triID; 
+
+    input wantshadedata; 
+    output shadedataready; 
+    reg shadedataready;
+    input[17:0] texeladdr; 
+    output[63:0] texel; 
+    wire[63:0] texel;
+    input wanttexel; 
+    output texelready; 
+    reg texelready;
+    input[63:0] tm3_sram_data_in; 
+    wire[63:0] tm3_sram_data_in;
+    output[63:0] tm3_sram_data_out; 
+    wire[63:0] tm3_sram_data_out;
+    reg[63:0] tm3_sram_data_xhdl0;
+
+    output[18:0] tm3_sram_addr; 
+    reg[18:0] tm3_sram_addr;
+    output[7:0] tm3_sram_we; 
+    reg[7:0] tm3_sram_we;
+    output[1:0] tm3_sram_oe; 
+    reg[1:0] tm3_sram_oe;
+    output tm3_sram_adsp; 
+    reg tm3_sram_adsp;
+    input globalreset; 
+    input clk; 
+
+    reg[3:0] state; 
+    reg[3:0] next_state; 
+    reg[17:0] waddress; 
+    reg[14:0] faddress; 
+    reg[6:0] fcount; 
+    reg fbdatavalidl; 
+
+    reg[17:0] temp_waddress; 
+    reg[14:0] temp_faddress; 
+    reg[6:0] temp_fcount; 
+    reg temp_fbdatavalidl; 
+    reg temp_texelready;
+    reg temp_shadedataready;
+
+    assign tm3_sram_data_out = tm3_sram_data_xhdl0;
+
+    assign dataout = tm3_sram_data_in ;
+    assign addr = tm3_sram_data_in[62:48] ;
+    assign shadedata = tm3_sram_data_in ;
+    assign texel = tm3_sram_data_in ;
+
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+
+          state <= 0 ; 
+          waddress <= 0;
+          faddress <= 0;
+          fcount <= 7'b1101011 ; 
+          fbdatavalid <= 1'b0 ; 
+          fbdatavalidl <= 1'b0 ; 
+          shadedataready <= 1'b0 ; 
+          texelready <= 1'b0 ; 
+          sramdatal <= 0;
+          fbdata <= 0;
+       end
+       else
+
+       begin
+          state <= next_state ; 
+          sramdatal <= tm3_sram_data_in[47:0] ; 
+          fbdata <= tm3_sram_data_in ; 
+          fbdatavalid <= fbdatavalidl ; 
+
+fbdatavalidl <= temp_fbdatavalidl;
+texelready <= temp_texelready;
+shadedataready <= temp_shadedataready;
+fcount <= temp_fcount;
+faddress <= temp_faddress;
+waddress <= temp_waddress;
+
+       end 
+    end 
+
+    always @(state or addr_ready or data_ready or waddress or datain or wantDir or 
+             want_read or wantwriteback or writebackdata or writebackaddr or 
+             fcount or fbpage or faddress or fbnextscanline or triID or wantshadedata or 
+             wanttexel or texeladdr)
+
+    begin
+       case (state)
+
+          0 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      if (addr_ready == 1'b1)
+                      begin
+                         next_state = 1 ; 
+                      end
+                      else if (want_read == 1'b1)
+                      begin
+                         next_state = 2 ; 
+                      end
+                      else if (data_ready == 1'b1)
+                      begin
+
+                         next_state = 3 ; 
+                      end
+                      else if (wantDir == 1'b1)
+                      begin
+                         next_state = 5 ; 
+                      end
+                      else if (wantwriteback == 1'b1)
+                      begin
+                         next_state = 6 ; 
+                      end
+                      else if (wantshadedata == 1'b1)
+                      begin
+
+                         next_state = 9 ; 
+                      end
+                      else if (wanttexel == 1'b1)
+                      begin
+                         next_state = 10 ; 
+                      end
+                      else if (fcount != 0)
+                      begin
+                         next_state = 7 ; 
+                      end
+                      else if (fbnextscanline == 1'b1)
+                      begin
+
+                         next_state = 8 ; 
+                      end
+                      else
+                      begin
+                         next_state = 0 ; 
+                      end 
+				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_shadedataready = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         if (addr_ready == 1'b1)
+
+                         begin
+                            temp_waddress = addrin ; 
+                         end 
+
+                   end
+          1 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      want_addr = 1'b0 ; 
+                      if (addr_ready == 1'b0)
+                      begin
+                         next_state = 0 ; 
+
+                      end
+                      else
+                      begin
+                         next_state = 1 ; 
+                      end 
+                   end
+          2 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+
+                      read_ready = 1'b0 ; 
+                      if (want_read == 1'b0)
+                      begin
+                         next_state = 0 ; 
+                      end
+                      else
+                      begin
+                         next_state = 2 ; 
+                      end 
+
+				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_shadedataready = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         if (want_read == 1'b0)
+                         begin
+
+                            temp_waddress = waddress + 1 ; 
+                         end 
+
+                   end
+          3 :
+                   begin
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_addr = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      tm3_sram_data_xhdl0 = datain ; 
+                      tm3_sram_we = 8'b00000000 ; 
+
+
+                   tm3_sram_oe = 2'b11 ; 
+                      tm3_sram_adsp = 1'b0 ; 
+                      want_data = 1'b0 ; 
+                      next_state = 4 ; 
+
+				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_shadedataready = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         temp_waddress = waddress + 1 ; 
+
+                   end
+          4 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_addr = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      if (data_ready == 1'b0)
+                      begin
+
+                         next_state = 0 ; 
+                      end
+                      else
+                      begin
+                         next_state = 4 ; 
+                      end 
+                      want_data = 1'b0 ; 
+                   end
+
+          5 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       writebackack = 1'b0 ; 
+
+                     dirReady = 1'b1 ; 
+                      if (wantDir == 1'b0)
+                      begin
+                         next_state = 0 ; 
+
+                      end
+                      else
+                      begin
+                         next_state = 5 ; 
+                      end 
+
+				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_shadedataready = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         if (wantDir == 1'b0)
+                         begin
+                            temp_waddress = waddress + 1 ; 
+                         end 
+
+                   end
+          6 :
+                   begin
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+
+                      tm3_sram_data_xhdl0 = writebackdata ; 
+                      tm3_sram_we = 8'b00000000 ; 
+                      tm3_sram_oe = 2'b11 ; 
+                      tm3_sram_adsp = 1'b0 ; 
+                      tm3_sram_addr = {1'b0, writebackaddr} ; 
+                      writebackack = 1'b1 ; 
+                      next_state = 0 ; 
+                   end
+
+          7 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      tm3_sram_addr = {3'b011, fbpage, faddress} ; 
+                      if ((fcount == 1) | (addr_ready == 1'b1) | (want_read == 1'b1) | (data_ready == 1'b1) | (wantDir == 1'b1) | (wantwriteback == 1'b1))
+                      begin
+                         next_state = 0 ; 
+
+                      end
+                      else
+                      begin
+                         next_state = 7 ; 
+                      end 
+
+
+				          temp_shadedataready = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         temp_fbdatavalidl = 1'b1 ; 
+                         if (fcount != 0)
+                         begin
+                            temp_faddress = faddress + 1 ; 
+                            temp_fcount = fcount - 1 ; 
+                         end 
+
+                   end
+          8 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       tm3_sram_addr = {1'b0, waddress} ; 
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      next_state = 7 ; 
+
+				   				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_shadedataready = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         temp_fcount = 7'b1101011 ; 
+                         if (faddress == 25680)
+                         begin
+                            temp_faddress = 0;
+                         end 
+                   end
+          9 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      tm3_sram_addr = {3'b010, triID} ; 
+                      next_state = 0 ; 
+
+				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_texelready = 1'b0 ; 
+                         temp_shadedataready = 1'b1 ; 
+                   end
+
+          10 :
+                   begin
+				       tm3_sram_we = 8'b11111111 ; 
+				       tm3_sram_oe = 2'b01 ; 
+				       tm3_sram_adsp = 1'b0 ; 
+				       tm3_sram_data_xhdl0 = 0;
+				       want_addr = 1'b1 ; 
+				       want_data = 1'b1 ; 
+				       read_ready = 1'b1 ; 
+				       dirReady = 1'b0 ; 
+				       writebackack = 1'b0 ; 
+                      tm3_sram_addr = {1'b0, texeladdr} ; 
+                      next_state = 0 ; 
+
+				          temp_fbdatavalidl = 1'b0 ; 
+				          temp_shadedataready = 1'b0 ; 
+                         temp_texelready = 1'b1 ; 
+                   end
+       endcase 
+    end 
+ endmodule
+
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+
+ module raysend (as, ack, addr, dir, origx, origy, origz, rgData, rgAddr, rgWE, rgAddrValid, rgDone, globalreset, clk, statepeek);
+    // Sends origx, origy, origz and dir out on rgData as five words, waiting for rgDone after each word.
+    input as;
+    output ack;
+    reg ack;
+    input[3:0] addr;
+    input[47:0] dir;
+    input[27:0] origx;
+    input[27:0] origy;
+    input[27:0] origz;
+    output[31:0] rgData;
+    reg[31:0] rgData;
+    // rgWE carries the word index (1..5); rgAddr is loaded from addr when a transfer starts.
+    output[3:0] rgAddr;
+    reg[3:0] rgAddr;
+    output[2:0] rgWE;
+    reg[2:0] rgWE;
+    output rgAddrValid;
+    reg rgAddrValid;
+    input rgDone;
+    input globalreset;
+    input clk;
+    output[2:0] statepeek;
+    reg[2:0] statepeek;
+    // State 0 idles, states 1-5 wait for rgDone, states 6-9 load the next word and go back to waiting.
+    reg[3:0] state;
+    reg[3:0] next_state;
+
+    // Combinational "next value" copies of the registered outputs. A branch that does not assign
+    // one of them leaves it at its previous value (the case below has no default arm either).
+    reg[31:0] temp_rgData;
+    reg[2:0] temp_rgWE;
+    reg temp_rgAddrValid;
+    reg temp_ack;
+    reg[3:0] temp_rgAddr;
+    // State and output registers; globalreset is sampled on the clock edge (synchronous reset).
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          state <= 0;
+          ack <= 1'b0;
+          rgWE <= 3'b000;
+          rgData <= 0;
+          rgAddrValid <= 1'b0;
+          rgAddr <= 0;
+       end
+       else
+       begin
+          state <= next_state;
+
+          rgData <= temp_rgData;
+          rgWE <= temp_rgWE;
+          rgAddrValid <= temp_rgAddrValid;
+          ack <= temp_ack;
+          rgAddr <= temp_rgAddr;
+
+       end
+    end
+    // Next-state/next-output logic. addr, dir and origx/y/z are read below, so they are listed too.
+    always @(state or ack or as or rgDone or addr or dir or origx or origy or origz)
+    begin
+
+       case (state)
+          0 :
+          begin
+             if ((as == 1'b1) & (ack == 1'b0))
+             begin
+                next_state = 1;
+             end
+             else
+             begin
+                next_state = 0;
+             end
+             statepeek = 3'b001;
+             // Start of a transfer: queue word 1 (origx) and latch the destination address.
+             if ((as == 1'b1) & (ack == 1'b0))
+             begin
+                temp_rgData = {4'b0000, origx};
+                temp_rgWE = 3'b001;
+                temp_rgAddrValid = 1'b1;
+                temp_rgAddr = addr;
+             end
+             if (as == 1'b0 & ack == 1'b1)
+             begin
+                temp_ack = 1'b0;
+             end
+
+          end
+          1 :
+          begin
+             if (rgDone == 1'b1)
+             begin
+                next_state = 6;
+             end
+             else
+             begin
+                next_state = 1;
+             end
+             statepeek = 3'b010;
+
+             if (rgDone == 1'b1)
+             begin
+                temp_rgAddrValid = 1'b0;
+             end
+
+          end
+          2 :
+          begin
+             if (rgDone == 1'b1)
+             begin
+                next_state = 7;
+             end
+             else
+             begin
+                next_state = 2;
+             end
+             statepeek = 3'b011;
+
+             if (rgDone == 1'b1)
+             begin
+                temp_rgAddrValid = 1'b0;
+             end
+
+          end
+          3 :
+          begin
+             if (rgDone == 1'b1)
+             begin
+                next_state = 8;
+             end
+             else
+             begin
+                next_state = 3;
+             end
+             statepeek = 3'b100;
+
+             if (rgDone == 1'b1)
+             begin
+                temp_rgAddrValid = 1'b0;
+             end
+
+          end
+          4 :
+          begin
+             if (rgDone == 1'b1)
+             begin
+                next_state = 9;
+             end
+             else
+             begin
+                next_state = 4;
+             end
+             statepeek = 3'b101;
+
+             if (rgDone == 1'b1)
+             begin
+                temp_rgAddrValid = 1'b0;
+             end
+          end
+
+          5 :
+          begin
+             if (rgDone == 1'b1)
+             begin
+                next_state = 0;
+             end
+             else
+             begin
+                next_state = 5;
+             end
+             statepeek = 3'b110;
+             // Last wait state: ack is requested here and cleared again in state 0 once as drops.
+             temp_ack = 1'b1;
+             if (rgDone == 1'b1)
+             begin
+                temp_rgAddrValid = 1'b0;
+             end
+
+          end
+
+          6 :
+          begin
+             next_state = 2;
+             // Word 2: origy.
+             temp_rgData = {4'b0000, origy};
+             temp_rgWE = 3'b010;
+             temp_rgAddrValid = 1'b1;
+
+          end
+          7 :
+          begin
+             next_state = 3;
+             // Word 3: origz.
+             temp_rgData = {4'b0000, origz};
+             temp_rgWE = 3'b011;
+             temp_rgAddrValid = 1'b1;
+          end
+          8 :
+          begin
+             next_state = 4;
+             // Word 4: dir[31:16] in the upper half, dir[47:32] in the lower half.
+             temp_rgData = {dir[31:16], dir[47:32]};
+             temp_rgWE = 3'b100;
+             temp_rgAddrValid = 1'b1;
+          end
+          9 :
+          begin
+             next_state = 5;
+             // Word 5: dir[15:0], zero-extended.
+             temp_rgData = {16'b0000000000000000, dir[15:0]};
+             temp_rgWE = 3'b101;
+             temp_rgAddrValid = 1'b1;
+          end
+       endcase
+    end
+ endmodule
+
+    
+    
+    
+    
+    
+
+ module raygencont (go, initcount, busyout, cycles, nextaddr, nas0, nas1, page, dirReady, wantDir, dirIn, addrIn, as, addr, ack, dir, raygroup0, raygroupvalid0, busy0, raygroup1, raygroupvalid1, busy1, globalreset, clk, statepeek);
+    // Control FSM: after go it requests directions (wantDir/dirReady/dirIn), forwards them via as/ack, and flags ray groups valid on two channels.
+    input go;
+    input[14:0] initcount;
+    output busyout;
+    wire busyout;
+    reg temp_busyout;
+    output[30:0] cycles;
+    reg[30:0] cycles;
+    output[17:0] nextaddr;
+    wire[17:0] nextaddr;
+    output nas0;
+    // busyout, nas0 and nas1 are wires driven directly from the combinational temp_* values.
+    wire nas0;
+    reg temp_nas0;
+    output nas1;
+    wire nas1;
+    reg temp_nas1;
+    input page;
+    input dirReady;
+    output wantDir;
+    reg wantDir;
+    input[47:0] dirIn;
+    input[14:0] addrIn;
+    output as;
+    reg as;
+    output[3:0] addr;
+    // addr[3:2] follows the active ray group; addr[1:0] advances on each ack in state 2 until it reaches 2.
+    reg[3:0] addr;
+    input ack;
+    output[47:0] dir;
+    reg[47:0] dir;
+    output[1:0] raygroup0;
+    wire[1:0] raygroup0;
+    output raygroupvalid0;
+    reg raygroupvalid0;
+    input busy0;
+    output[1:0] raygroup1;
+    wire[1:0] raygroup1;
+    output raygroupvalid1;
+
+    reg raygroupvalid1;
+    input busy1;
+    input globalreset;
+    input clk;
+    output[2:0] statepeek;
+    reg[2:0] statepeek;
+
+    // Internal state. first and destaddr are only written by the reset branch and never read in this module.
+    reg[2:0] state;
+    reg[2:0] next_state;
+    reg[14:0] count;
+    reg first;
+    reg[17:0] destaddr;
+    wire[1:0] busy;
+    reg[1:0] loaded;
+    reg[1:0] groupID;
+    reg active;
+    // Combinational next-value copies of the registers above; a branch that skips one keeps its old value.
+    reg[47:0] temp_dir;
+    reg[30:0] temp_cycles;
+    reg[1:0] temp_addr;
+    reg[1:0] temp_loaded;
+    reg[1:0] temp_groupID;
+    reg[14:0] temp_count;
+    reg temp_active;
+    reg temp_raygroupvalid1;
+    reg temp_raygroupvalid0;
+
+    assign busy = {busy1, busy0};
+    // Registers; globalreset is synchronous. Note that the reset branch only clears addr[1:0], not addr[3:2].
+    always @(posedge clk)
+    begin
+
+       if (globalreset == 1'b1)
+
+       begin
+          state <= 0;
+          cycles <= 0;
+          dir <= 0;
+          addr[1:0] <= 2'b00;
+          groupID <= 2'b00;
+          count <= 0;
+          first <= 1'b0;
+          destaddr <= 0;
+          raygroupvalid0 <= 1'b0;
+          raygroupvalid1 <= 1'b0;
+          loaded <= 2'b00;
+
+          active <= 1'b0;
+       end
+       else
+       begin
+          addr[3:2] <= (active == 1'b0) ? {1'b0, groupID[0]} : {1'b1, groupID[1]};
+          addr[1:0] <= temp_addr[1:0];
+          state <= next_state;
+
+          dir <= temp_dir;
+          cycles <= temp_cycles;
+          loaded <= temp_loaded;
+          groupID <= temp_groupID;
+          count <= temp_count;
+          active <= temp_active;
+          raygroupvalid0 <= temp_raygroupvalid0;
+          raygroupvalid1 <= temp_raygroupvalid1;
+
+       end
+    end
+    // Continuous outputs derived from registers, inputs and the combinational temp_* flags.
+    assign raygroup0 = {1'b0, groupID[0]};
+    assign raygroup1 = {1'b1, groupID[1]};
+    assign nextaddr = {2'b11, page, addrIn};
+    assign busyout = temp_busyout;
+    assign nas0 = temp_nas0;
+    assign nas1 = temp_nas1;
+    // Next-state/output logic. active, cycles, groupID, initcount and dirIn are read below, so they are listed; otherwise temp_cycles is not recomputed in simulation while the FSM waits in one state.
+    always @(state or go or ack or busy or dirReady or addr or count or loaded or active or cycles or groupID or initcount or dirIn)
+    begin
+       case (state)
+          0 :
+          begin
+             as = 1'b0;
+             wantDir = 1'b0;
+             if (go == 1'b1)
+             begin
+                next_state = 1;
+             end
+             else
+             begin
+                next_state = 0;
+             end
+             statepeek = 3'b001;
+             temp_busyout = 1'b0;
+             temp_nas0 = 1'b0;
+             temp_nas1 = 1'b0;
+
+             // The cycle counter is cleared only when a run starts.
+             if (go == 1'b1)
+             begin
+                temp_cycles = 0;
+             end
+             temp_addr[1:0] = 2'b00;
+             temp_loaded = 2'b00;
+             temp_groupID = 2'b00;
+             temp_count = initcount;
+             temp_active = 1'b0;
+
+          end
+          1 :
+          begin
+             as = dirReady;
+             wantDir = 1'b1;
+             if (dirReady == 1'b1)
+             begin
+                next_state = 2;
+             end
+             else
+             begin
+                next_state = 1;
+             end
+             statepeek = 3'b010;
+             temp_busyout = 1'b1;
+             if (addr[1:0] == 2'b00 & dirReady == 1'b1 & active == 1'b0)
+             begin
+                temp_nas0 = 1'b1;
+                temp_nas1 = 1'b1;
+             end
+             // The third direction (addr[1:0] == 2) marks the currently active group as loaded.
+             temp_dir = dirIn;
+             if (dirReady == 1'b1 & addr[1:0] == 2'b10)
+             begin
+                if (active == 1'b0)
+                begin
+                   temp_loaded[0] = 1'b1;
+                end
+                else
+                begin
+                   temp_loaded[1] = 1'b1;
+                end
+             end
+             temp_cycles = cycles + 1;
+
+
+          end
+          2 :
+          begin
+             wantDir = 1'b0;
+             as = 1'b1;
+             if ((ack == 1'b1) & (addr[1:0] != 2'b10))
+             begin
+                next_state = 1;
+             end
+             else if (ack == 1'b1)
+             begin
+                if ((loaded[0]) == 1'b1 & (busy[0]) == 1'b0)
+                begin
+                   next_state = 3;
+                end
+                else if ((loaded[1]) == 1'b1 & (busy[1]) == 1'b0)
+                begin
+                   next_state = 4;
+                end
+                else if (loaded != 2'b11)
+                begin
+
+                   next_state = 1;
+                end
+                else
+                begin
+                   next_state = 2;
+                end
+             end
+             else
+             begin
+                next_state = 2;
+             end
+             statepeek = 3'b011;
+             temp_busyout = 1'b1;
+             temp_nas0 = 1'b0;
+             temp_nas1 = 1'b0;
+
+             if ((ack == 1'b1) & (addr[1:0] != 2'b10))
+             begin
+                temp_addr[1:0] = addr[1:0] + 2'b01;
+             end
+             else if ((ack == 1'b1) & addr[1:0] == 2'b10)
+             begin
+                if ((loaded[0]) == 1'b1 & (busy[0]) == 1'b0)
+                begin
+                   temp_raygroupvalid0 = 1'b1;
+                end
+                else if ((loaded[1]) == 1'b1 & (busy[1]) == 1'b0)
+                begin
+
+                   temp_raygroupvalid1 = 1'b1;
+                end
+                else if ((loaded[0]) == 1'b0)
+                begin
+                   temp_active = 1'b0;
+                   temp_addr[1:0] = 2'b00;
+                end
+                else if ((loaded[1]) == 1'b0)
+                begin
+                   temp_active = 1'b1;
+                   temp_addr[1:0] = 2'b00;
+                end
+             end
+
+             temp_cycles = cycles + 1;
+          end
+          4 :
+          begin
+             if ((busy[1]) == 1'b0)
+             begin
+                next_state = 4;
+             end
+             else if ((loaded[0]) == 1'b1 & (busy[0]) == 1'b0)
+             begin
+                next_state = 3;
+             end
+             else if (count > 0)
+             begin
+
+                next_state = 1;
+             end
+             else
+             begin
+                next_state = 0;
+             end
+             statepeek = 3'b101;
+             temp_busyout = 1'b1;
+             temp_nas0 = 1'b0;
+             temp_nas1 = 1'b0;
+             // Once busy[1] is seen high: flip groupID[1], drop raygroupvalid1 and count the group down.
+             if ((busy[1]) == 1'b1)
+             begin
+                temp_groupID[1] = ~groupID[1];
+                temp_raygroupvalid1 = 1'b0;
+                temp_count = count - 1;
+                if ((loaded[0]) == 1'b1 & (busy[0]) == 1'b0)
+                begin
+                   temp_raygroupvalid0 = 1'b1;
+                end
+
+                else if ((loaded[0]) == 1'b0)
+                begin
+                   temp_active = 1'b0;
+                end
+                else
+                begin
+                   temp_active = 1'b1;
+                end
+             end
+             temp_loaded[1] = 1'b0;
+             temp_addr[1:0] = 2'b00;
+
+             temp_cycles = cycles + 1;
+          end
+          3 :
+          begin
+             if ((busy[0]) == 1'b0)
+             begin
+                next_state = 3;
+
+             end
+             else if ((loaded[1]) == 1'b1 & (busy[1]) == 1'b0)
+             begin
+                next_state = 4;
+             end
+             else if (count > 0)
+             begin
+                next_state = 1;
+             end
+             else
+             begin
+                next_state = 0;
+
+             end
+             statepeek = 3'b100;
+             temp_busyout = 1'b1;
+             temp_nas0 = 1'b0;
+             temp_nas1 = 1'b0;
+             // Mirror image of state 4 for channel 0.
+             if ((busy[0]) == 1'b1)
+             begin
+                temp_groupID[0] = ~groupID[0];
+                temp_raygroupvalid0 = 1'b0;
+                temp_count = count - 1;
+                if ((loaded[1]) == 1'b1 & (busy[1]) == 1'b0)
+                begin
+                   temp_raygroupvalid1 = 1'b1;
+
+                end
+                else if ((loaded[1]) == 1'b0)
+                begin
+                   temp_active = 1'b1;
+                end
+                else
+                begin
+                   temp_active = 1'b0;
+                end
+             end
+             temp_loaded[0] = 1'b0;
+             temp_addr[1:0] = 2'b00;
+
+
+             temp_cycles = cycles + 1;
+          end
+       endcase
+    end
+ endmodule
+    
+    
+    
+    
+    
+    
+    
+
+ module resultrecieve (valid01, valid10, id01a, id01b, id01c, id10a, id10b, id10c, hit01a, hit01b, hit01c, hit10a, hit10b, hit10c, u01a, u01b, u01c, v01a, v01b, v01c, u10a, u10b, u10c, v10a, v10b, v10c, rgResultData, rgResultReady, rgResultSource, globalreset, clk);
+    // Unpacks a four-word result from rgResultData into the "01" or "10" output bank, selected by rgResultSource, then raises valid01 / valid10.
+    output valid01;
+    reg valid01;
+    output valid10;
+    reg valid10;
+    output[15:0] id01a;
+    reg[15:0] id01a;
+    output[15:0] id01b;
+    reg[15:0] id01b;
+    output[15:0] id01c;
+    reg[15:0] id01c;
+
+    output[15:0] id10a;
+    reg[15:0] id10a;
+    output[15:0] id10b;
+    reg[15:0] id10b;
+    output[15:0] id10c;
+    reg[15:0] id10c;
+    output hit01a;
+    reg hit01a;
+    output hit01b;
+    reg hit01b;
+    output hit01c;
+    reg hit01c;
+
+    output hit10a;
+    reg hit10a;
+    output hit10b;
+    reg hit10b;
+    output hit10c;
+    reg hit10c;
+    output[7:0] u01a;
+    reg[7:0] u01a;
+    output[7:0] u01b;
+    reg[7:0] u01b;
+    output[7:0] u01c;
+    reg[7:0] u01c;
+
+    output[7:0] v01a;
+    reg[7:0] v01a;
+    output[7:0] v01b;
+    reg[7:0] v01b;
+    output[7:0] v01c;
+    reg[7:0] v01c;
+    output[7:0] u10a;
+    reg[7:0] u10a;
+    output[7:0] u10b;
+    reg[7:0] u10b;
+    output[7:0] u10c;
+    reg[7:0] u10c;
+
+    output[7:0] v10a;
+    reg[7:0] v10a;
+    output[7:0] v10b;
+    reg[7:0] v10b;
+    output[7:0] v10c;
+    reg[7:0] v10c;
+    input[31:0] rgResultData;
+    input rgResultReady;
+    input[1:0] rgResultSource;
+    input globalreset;
+    input clk;
+    // Combinational next-value copies of the outputs; a state that skips one leaves it at its old value.
+    reg temp_valid01;
+    reg temp_valid10;
+    reg[15:0] temp_id01a;
+    reg[15:0] temp_id01b;
+    reg[15:0] temp_id01c;
+    reg[15:0] temp_id10a;
+    reg[15:0] temp_id10b;
+    reg[15:0] temp_id10c;
+    reg temp_hit01a;
+    reg temp_hit01b;
+    reg temp_hit01c;
+    reg temp_hit10a;
+    reg temp_hit10b;
+    reg temp_hit10c;
+    reg[7:0] temp_u01a;
+    reg[7:0] temp_u01b;
+    reg[7:0] temp_u01c;
+    reg[7:0] temp_v01a;
+    reg[7:0] temp_v01b;
+    reg[7:0] temp_v01c;
+    reg[7:0] temp_u10a;
+    reg[7:0] temp_u10b;
+    reg[7:0] temp_u10c;
+    reg[7:0] temp_v10a;
+    reg[7:0] temp_v10b;
+    reg[7:0] temp_v10c;
+
+    reg[2:0] state;
+    reg[2:0] next_state;
+    // Output registers; globalreset is synchronous and clears every output.
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          state <= 0;
+          valid01 <= 1'b0;
+          valid10 <= 1'b0;
+          hit01a <= 1'b0;
+          hit01b <= 1'b0;
+          hit01c <= 1'b0;
+          hit10a <= 1'b0;
+          hit10b <= 1'b0;
+          hit10c <= 1'b0;
+          id01a <= 0;
+
+          id01b <= 0;
+          id01c <= 0;
+          id10a <= 0;
+          id10b <= 0;
+          id10c <= 0;
+          u01a <= 0;
+          u01b <= 0;
+          u01c <= 0;
+          v01a <= 0;
+          v01b <= 0;
+          v01c <= 0;
+          u10a <= 0;
+
+          u10b <= 0;
+          u10c <= 0;
+          v10a <= 0;
+          v10b <= 0;
+          v10c <= 0;
+       end
+       else
+       begin
+          state <= next_state;
+          valid01 <= temp_valid01;
+          valid10 <= temp_valid10;
+          id01a <= temp_id01a;
+          id01b <= temp_id01b;
+          id01c <= temp_id01c;
+          id10a <= temp_id10a;
+          id10b <= temp_id10b;
+          id10c <= temp_id10c;
+          hit01a <= temp_hit01a;
+          hit01b <= temp_hit01b;
+          hit01c <= temp_hit01c;
+          u01a <= temp_u01a;
+          u01b <= temp_u01b;
+          u01c <= temp_u01c;
+          u10a <= temp_u10a;
+          u10b <= temp_u10b;
+          u10c <= temp_u10c;
+          v01a <= temp_v01a;
+          v01b <= temp_v01b;
+          v01c <= temp_v01c;
+          v10a <= temp_v10a;
+          v10b <= temp_v10b;
+          v10c <= temp_v10c;
+          hit10a <= temp_hit10a;
+          hit10b <= temp_hit10b;
+          hit10c <= temp_hit10c;
+       end
+    end
+    // Next-state/unpack logic: states 1-3 fill the "01" bank, states 4-6 the "10" bank.
+    // rgResultData is read in every state, so it is part of the sensitivity list.
+    always @(state or rgResultReady or rgResultSource or rgResultData)
+    begin
+       case (state)
+          0 :
+          begin
+             if (rgResultReady == 1'b1 & rgResultSource == 2'b01)
+             begin
+                next_state = 1;
+             end
+             else if (rgResultReady == 1'b1 & rgResultSource == 2'b10)
+             begin
+                next_state = 4;
+             end
+             else
+             begin
+                next_state = 0;
+             end
+
+             // Word 0 carries the first two ids for whichever source is ready.
+             temp_valid01 = 1'b0;
+             temp_valid10 = 1'b0;
+             if (rgResultReady == 1'b1 & rgResultSource == 2'b01)
+             begin
+                temp_id01a = rgResultData[31:16];
+                temp_id01b = rgResultData[15:0];
+             end
+             else if (rgResultReady == 1'b1 & rgResultSource == 2'b10)
+             begin
+                temp_id10a = rgResultData[31:16];
+                temp_id10b = rgResultData[15:0];
+             end
+
+          end
+
+          1 :
+          begin
+             next_state = 2;
+             // Word 1: third id plus the three hit flags.
+             temp_valid01 = 1'b0;
+             temp_valid10 = 1'b0;
+             temp_id01c = rgResultData[15:0];
+             temp_hit01a = rgResultData[18];
+             temp_hit01b = rgResultData[17];
+             temp_hit01c = rgResultData[16];
+
+          end
+          2 :
+
+          begin
+             next_state = 3;
+             // Word 2: the three u bytes.
+             temp_valid01 = 1'b0;
+             temp_valid10 = 1'b0;
+             temp_u01a = rgResultData[23:16];
+             temp_u01b = rgResultData[15:8];
+             temp_u01c = rgResultData[7:0];
+
+          end
+          3 :
+          begin
+             next_state = 0;
+             // Word 3: the three v bytes; temp_valid01 is high in this state only.
+             temp_valid10 = 1'b0;
+             temp_v01a = rgResultData[23:16];
+             temp_v01b = rgResultData[15:8];
+             temp_v01c = rgResultData[7:0];
+             temp_valid01 = 1'b1;
+
+          end
+          4 :
+          begin
+             next_state = 5;
+             // Word 1 for the "10" bank.
+             temp_valid01 = 1'b0;
+             temp_valid10 = 1'b0;
+             temp_id10c = rgResultData[15:0];
+
+             temp_hit10a = rgResultData[18];
+             temp_hit10b = rgResultData[17];
+             temp_hit10c = rgResultData[16];
+
+          end
+          5 :
+
+          begin
+             next_state = 6;
+             // Word 2 for the "10" bank.
+             temp_valid01 = 1'b0;
+             temp_valid10 = 1'b0;
+             temp_u10a = rgResultData[23:16];
+             temp_u10b = rgResultData[15:8];
+             temp_u10c = rgResultData[7:0];
+
+          end
+          6 :
+          begin
+             next_state = 0;
+             // Word 3 for the "10" bank; temp_valid10 is high in this state only.
+             temp_valid01 = 1'b0;
+             temp_v10a = rgResultData[23:16];
+             temp_v10b = rgResultData[15:8];
+             temp_v10c = rgResultData[7:0];
+             temp_valid10 = 1'b1;
+
+          end
+       endcase
+    end
+ endmodule
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+
+ module resultwriter (valid01, valid10, id01a, id01b, id01c, id10a, id10b, id10c, hit01a, hit01b, hit01c, hit10a, hit10b, hit10c, u01a, u01b, u01c, v01a, v01b, v01c, u10a, u10b, u10c, v10a, v10b, v10c, addr, as01, as10, bkcolour, shadedata, triID, wantshadedata, shadedataready, texinfo, texaddr, texeladdr, texel, wanttexel, texelready, dataout, addrout, write, ack, globalreset, clk);
+    // Shades hit records a, b and c of one result set (01 or 10) and presents them as one packed word.
+    input valid01;              // high: mark a 01 result set as pending
+    input valid10;              // high: mark a 10 result set as pending
+    input[15:0] id01a;          // triangle IDs, driven onto triID while shade data is requested
+    input[15:0] id01b;
+    input[15:0] id01c;
+    input[15:0] id10a;
+    input[15:0] id10b;
+    input[15:0] id10c;
+    // Per-record hit flags; a record whose flag is low is shaded with bkcolour.
+    input hit01a;
+    input hit01b;
+    input hit01c;
+    input hit10a;
+    input hit10b;
+    input hit10c;
+    input[7:0] u01a;            // the u/v inputs are only forwarded to the bilinearintrp instance
+    input[7:0] u01b;
+    input[7:0] u01c;
+    input[7:0] v01a;
+    input[7:0] v01b;
+    input[7:0] v01c;
+
+    input[7:0] u10a;
+    input[7:0] u10b;
+    input[7:0] u10c;
+    input[7:0] v10a;
+    input[7:0] v10b;
+    input[7:0] v10c;
+    input[17:0] addr;           // address pushed into the 01 / 10 address FIFOs
+    input as01;                 // write strobe of the 01 address FIFO
+    input as10;                 // write strobe of the 10 address FIFO
+    input[20:0] bkcolour;       // colour used for a record that did not hit
+    input[63:0] shadedata;      // shade record returned for triID
+    output[15:0] triID;
+    // Shade-data and texel request interfaces.
+    reg[15:0] triID;
+    output wantshadedata;
+    reg wantshadedata;
+    input shadedataready;
+    input[20:0] texinfo;        // captured into texinfol; feeds the linearmap instance
+    output[3:0] texaddr;
+    wire[3:0] texaddr;
+    output[17:0] texeladdr;
+    wire[17:0] texeladdr;
+    input[63:0] texel;
+    output wanttexel;
+    reg wanttexel;
+
+    input texelready;
+    output[63:0] dataout;
+    // dataout is declared reg because it is assigned in the clocked block below.
+    reg[63:0] dataout;
+    output[17:0] addrout;
+    wire[17:0] addrout;
+    output write;
+    wire write;
+    reg temp_write;
+    input ack;                  // acknowledges the write; ends state 1
+    input globalreset;          // synchronous, active high
+    input clk;
+    // State registers and the signals derived from them.
+    reg[3:0] state;
+    reg[3:0] next_state;
+    reg pending01;
+    reg pending10;
+    reg process01;              // 1: the 01 set is being processed, 0: the 10 set
+    wire[17:0] addrout01;
+    wire[17:0] addrout10;
+    wire shiften01;
+    wire shiften10;
+    reg temp_shiften01;
+    reg temp_shiften10;
+    reg[20:0] shadedataa;       // 21-bit colours of records a, b and c
+    reg[20:0] shadedatab;
+    reg[20:0] shadedatac;
+    wire hita;                  // hit flags of the set selected by process01
+    wire hitb;
+    wire hitc;
+
+    reg[2:0] selectuv;
+    wire[6:0] blr;              // outputs of the bilinearintrp instance
+    wire[6:0] blg;
+    wire[6:0] blb;
+    reg texmap;
+    reg lmenable;
+    wire[1:0] texelselect;
+    wire[6:0] texelr;           // outputs of the col16to21 instance
+    wire[6:0] texelg;
+    wire[6:0] texelb;
+    reg[20:0] texinfol;
+    // Next-value signals written by the combinational block and registered on the clock.
+    reg temp_pending01;
+    reg temp_pending10;
+    reg temp_process01;
+    reg temp_texmap;
+    reg[20:0] temp_texinfol;
+    reg[20:0] temp_shadedataa;
+    reg[20:0] temp_shadedatab;
+    reg[20:0] temp_shadedatac;
+    // Datapath: texel unpacker, texel address generator, colour interpolator, address FIFOs.
+    col16to21 col16to21inst (texel, texelselect, texelr, texelg, texelb);
+    linearmap linearmapinst (blb, blg, texinfol[17:0], texeladdr, texelselect, texinfol[20:18], lmenable, clk);
+    bilinearintrp bilinearimp (u01a, u01b, u01c, v01a, v01b, v01c, u10a, u10b, u10c, v10a, v10b, v10c, selectuv, shadedata[41:35], shadedata[62:56], shadedata[20:14], shadedata[34:28], shadedata[55:49], shadedata[13:7], shadedata[27:21], shadedata[48:42], shadedata[6:0], blr, blg, blb, clk);
+    fifo3 fifo3insta (addr, as01, addrout01, shiften01, globalreset, clk);
+    fifo3 fifo3instb (addr, as10, addrout10, shiften10, globalreset, clk);
+    assign hita = (hit01a & process01) | (hit10a & ~process01) ;
+    assign hitb = (hit01b & process01) | (hit10b & ~process01) ;
+    assign hitc = (hit01c & process01) | (hit10c & ~process01) ;
+    assign texaddr = shadedata[59:56] ;
+    assign shiften01 = temp_shiften01;
+    assign shiften10 = temp_shiften10;
+    assign write = temp_write;
+    // State and data registers.  globalreset clears the control state and the three colours;
+    // dataout has no reset value.
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          state <= 0 ;
+          pending01 <= 1'b0 ;
+          pending10 <= 1'b0 ;
+          shadedataa <= 0;
+          shadedatab <= 0;
+          shadedatac <= 0;
+          process01 <= 1'b0 ;
+          texmap <= 1'b0 ;
+
+          texinfol <= 0;
+       end
+       else
+       begin
+          state <= next_state ;
+
+          process01 <= temp_process01;
+          pending01 <= temp_pending01;
+          pending10 <= temp_pending10;
+          texmap <= temp_texmap;
+          texinfol <= temp_texinfol;
+          shadedataa <= temp_shadedataa;
+          shadedatab <= temp_shadedatab;
+          shadedatac <= temp_shadedatac;
+
+          dataout <= {1'b0,
+                      shadedataa[20],
+                      shadedataa[19],
+                      shadedataa[18],
+                      shadedataa[17],
+                      shadedataa[16],
+                      shadedataa[15],
+                      shadedataa[14],
+                      shadedataa[13],
+                      shadedataa[12],
+                      shadedataa[11],
+                      shadedataa[10],
+                      shadedataa[9],
+                      shadedataa[8],
+                      shadedataa[7],
+                      shadedataa[6],
+                      shadedataa[5],
+                      shadedataa[4],
+                      shadedataa[3],
+                      shadedataa[2],
+                      shadedataa[1],
+                      shadedataa[0],
+                      shadedatab[20],
+                      shadedatab[19],
+                      shadedatab[18],
+                      shadedatab[17],
+                      shadedatab[16],
+                      shadedatab[15],
+                      shadedatab[14],
+                      shadedatab[13],
+                      shadedatab[12],
+                      shadedatab[11],
+                      shadedatab[10],
+                      shadedatab[9],
+                      shadedatab[8],
+                      shadedatab[7],
+                      shadedatab[6],
+                      shadedatab[5],
+                      shadedatab[4],
+                      shadedatab[3],
+                      shadedatab[2],
+                      shadedatab[1],
+                      shadedatab[0],
+                      shadedatac[20],
+                      shadedatac[19],
+                      shadedatac[18],
+                      shadedatac[17],
+                      shadedatac[16],
+                      shadedatac[15],
+                      shadedatac[14],
+                      shadedatac[13],
+                      shadedatac[12],
+                      shadedatac[11],
+                      shadedatac[10],
+                      shadedatac[9],
+                      shadedatac[8],
+                      shadedatac[7],
+                      shadedatac[6],
+                      shadedatac[5],
+                      shadedatac[4],
+                      shadedatac[3],
+                      shadedatac[2],
+                      shadedatac[1],
+                      shadedatac[0]} ;
+       end
+       // dataout is registered (an earlier revision used a continuous assign), so it shows
+       // the shadedataa/b/c values held before the clock edge: {1'b0, a, b, c}.
+    end
+    assign addrout = (process01 == 1'b1) ? addrout01 : addrout10 ;
+    // Next-state/output logic.  NOTE(review): temp_* have no defaults (and there is no default state), so latches are inferred.
+    always @(state or process01 or pending01 or pending10 or valid01 or valid10 or ack or
+             shadedataready or texelready or texmap or texinfo or bkcolour or shadedata or
+             id01a or id01b or id01c or id10a or id10b or id10c or hita or hitb or hitc or selectuv or blr or blg or blb or texelr or texelg or texelb)
+    begin
+       case (state)
+          0 :   // idle: wait for a pending result set and choose which one to process
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      selectuv = 0;
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      if (pending01 == 1'b1 | pending10 == 1'b1)
+                      begin
+                         next_state = 2 ;
+                      end
+                      else
+
+                      begin
+                         next_state = 0 ;
+                      end
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_process01 = pending01 ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          2 :   // request shade data for record a
+                   begin
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      wantshadedata = 1'b1 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b00 ;
+                      if (process01 == 1'b1)
+                      begin
+                         triID = id01a ;
+
+                      end
+                      else
+                      begin
+                         triID = id10a ;
+                      end
+                      if (shadedataready == 1'b1)
+                      begin
+                         if (hita == 1'b1 & ((shadedata[63]) == 1'b1 | shadedata[63:62] == 2'b01))
+                         begin
+                            next_state = 3 ;
+                         end
+                         else
+
+                         begin
+                            next_state = 4 ;
+                         end
+                      end
+                      else
+                      begin
+                         next_state = 2 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+
+                      if (hita == 1'b1)
+                      begin
+                         temp_shadedataa = shadedata[20:0] ;
+                         temp_texmap = (~shadedata[63]) & shadedata[62] ;
+                      end
+                      else
+                      begin
+                         temp_shadedataa = bkcolour ;
+                      end
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          3 :   // record a: capture texinfo, then interpolate in state 8
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      selectuv[2] = ~process01 ;
+
+                      selectuv[1:0] = 2'b00 ;
+                      next_state = 8 ;
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_texinfol = texinfo ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+
+                   end
+          8 :   // record a: take the interpolated colour; fetch a texel next if texmap is set
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      wanttexel = 1'b0 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b00 ;
+                      lmenable = 1'b1 ;
+                      if (texmap == 1'b1)
+                      begin
+
+                         next_state = 11 ;
+                      end
+                      else
+                      begin
+                         next_state = 4 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_shadedataa[6:0] = blb ;
+                      temp_shadedataa[13:7] = blg ;
+                      temp_shadedataa[20:14] = blr ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          11 :  // record a: wait for the texel and use its colour
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      selectuv = 0;
+                      lmenable = 1'b0 ;
+
+                      wanttexel = 1'b1 ;
+                      if (texelready == 1'b1)
+                      begin
+                         next_state = 4 ;
+                      end
+                      else
+                      begin
+                         next_state = 11 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+
+                      temp_shadedataa[6:0] = texelb ;
+                      temp_shadedataa[13:7] = texelg ;
+                      temp_shadedataa[20:14] = texelr ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          12 :  // record b: wait for the texel and use its colour
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      selectuv = 0;
+                      lmenable = 1'b0 ;
+
+                      wanttexel = 1'b1 ;
+                      if (texelready == 1'b1)
+                      begin
+                         next_state = 5 ;
+                      end
+                      else
+                      begin
+                         next_state = 12 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_shadedatab[6:0] = texelb ;
+                      temp_shadedatab[13:7] = texelg ;
+                      temp_shadedatab[20:14] = texelr ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          13 :  // record c: wait for the texel and use its colour
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      selectuv = 0;
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b1 ;
+                      if (texelready == 1'b1)
+                      begin
+                         next_state = 1 ;
+                      end
+                      else
+                      begin
+                         next_state = 13 ;
+                      end
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_shadedatac[6:0] = texelb ;
+                      temp_shadedatac[13:7] = texelg ;
+                      temp_shadedatac[20:14] = texelr ;
+                      // Drive the FIFO/write strobes explicitly, exactly as sibling states 11 and 12 do.
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          6 :   // record b: capture texinfo, then interpolate in state 9
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b01 ;
+                      next_state = 9 ;
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_texinfol = texinfo ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          9 :   // record b: take the interpolated colour; fetch a texel next if texmap is set
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      wanttexel = 1'b0 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b01 ;
+                      lmenable = 1'b1 ;
+                      if (texmap == 1'b1)
+                      begin
+                         next_state = 12 ;
+
+                      end
+                      else
+                      begin
+                         next_state = 5 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+
+                      temp_shadedatab[6:0] = blb ;
+                      temp_shadedatab[13:7] = blg ;
+                      temp_shadedatab[20:14] = blr ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          7 :   // record c: capture texinfo, then interpolate in state 10
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b10 ;
+                      next_state = 10 ;
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_texinfol = texinfo ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+
+          10 :  // record c: take the interpolated colour; fetch a texel next if texmap is set
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      wanttexel = 1'b0 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b10 ;
+                      if (texmap == 1'b1)
+                      begin
+                         next_state = 13 ;
+                      end
+                      else
+                      begin
+                         next_state = 1 ;
+                      end
+
+                      lmenable = 1'b1 ;
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+                      temp_shadedatac[6:0] = blb ;
+                      temp_shadedatac[13:7] = blg ;
+                      temp_shadedatac[20:14] = blr ;
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          4 :   // shade data for record b
+                   begin
+                      wantshadedata = 1'b0 ;   // NOTE(review): states 2 and 5 raise this while waiting - confirm
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b01 ;
+                      if (process01 == 1'b1)
+                      begin
+                         triID = id01b ;
+                      end
+                      else
+                      begin
+
+                         triID = id10b ;
+                      end
+                      if (shadedataready == 1'b1)
+                      begin
+                         if (hitb == 1'b1 & ((shadedata[63]) == 1'b1 | shadedata[63:62] == 2'b01))
+                         begin
+                            next_state = 6 ;
+                         end
+                         else
+                         begin
+                            next_state = 5 ;
+                         end
+
+                      end
+                      else
+                      begin
+                         next_state = 4 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+
+                      if (hitb == 1'b1)
+                      begin
+                         temp_shadedatab = shadedata[20:0] ;
+                         temp_texmap = (~shadedata[63]) & shadedata[62] ;
+                      end
+                      else
+                      begin
+                         temp_shadedatab = bkcolour ;
+                      end
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          5 :   // request shade data for record c
+                   begin
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      wantshadedata = 1'b1 ;
+                      selectuv[2] = ~process01 ;
+                      selectuv[1:0] = 2'b10 ;
+                      if (process01 == 1'b1)
+
+                      begin
+                         triID = id01c ;
+                      end
+                      else
+                      begin
+                         triID = id10c ;
+                      end
+                      if (shadedataready == 1'b1)
+                      begin
+                         if (hitc == 1'b1 & ((shadedata[63]) == 1'b1 | shadedata[63:62] == 2'b01))
+                         begin
+                            next_state = 7 ;
+
+                         end
+                         else
+                         begin
+                            next_state = 1 ;
+                         end
+                      end
+                      else
+                      begin
+                         next_state = 5 ;
+                      end
+
+                      if (valid01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b1 ;
+                      end
+                      if (valid10 == 1'b1)
+                      begin
+                         temp_pending10 = 1'b1 ;
+                      end
+
+                      if (hitc == 1'b1)
+                      begin
+                         temp_shadedatac = shadedata[20:0] ;
+                         temp_texmap = (~shadedata[63]) & shadedata[62] ;
+                      end
+                      else
+                      begin
+                         temp_shadedatac = bkcolour ;
+                      end
+
+                      temp_shiften01 = 1'b0;
+                      temp_shiften10 = 1'b0;
+                      temp_write = 1'b0;
+                   end
+          1 :   // write: hold write high until ack, then clear the processed set's pending flag
+
+                   begin
+                      wantshadedata = 1'b0 ;
+                      triID = 0;
+                      selectuv = 0;
+                      lmenable = 1'b0 ;
+                      wanttexel = 1'b0 ;
+                      if (ack == 1'b1)
+                      begin
+                         next_state = 0 ;
+                      end
+                      else
+                      begin
+                         next_state = 1 ;
+                      end
+
+                      if (ack == 1'b1 & process01 == 1'b1)
+                      begin
+                         temp_pending01 = 1'b0 ;
+                      end
+
+                      else if (ack == 1'b1 & process01 == 1'b0)
+                      begin
+                         temp_pending10 = 1'b0 ;
+                      end
+                      // NOTE(review): both FIFOs shift when the 01 set is acked and neither shifts for the 10 set - confirm intent.
+                      if (process01 == 1'b1 &  ack == 1'b1)
+                      begin
+                         temp_shiften01 = 1'b1;
+                         temp_shiften10 = 1'b1;
+                      end
+                      temp_write = 1'b1;
+                   end
+       endcase
+    end
+ endmodule
+ //////////////////////////////////////////////////////////////////////////////////////////////
+ //
+ // Verilog file generated by X-HDL - Revision 3.2.38  Jan. 9, 2004 
+ // Sun Feb  8 14:14:35 2004
+ //
+ //      Input file         : G:/jamieson/VERILOG_BENCHMARKS/RAYTRACE/col16to21.vhd
+ //      Design name        : col16to21
+ //      Author             : 
+ //      Company            : 
+ //
+ //      Description        : 
+ //
+ //
+ //////////////////////////////////////////////////////////////////////////////////////////////
+ //
+ module col16to21 (dataline, texelselect, r, g, b);
+    // Unpacks one 16-bit texel from a 64-bit memory word into three 7-bit channels.
+    //   dataline    : four packed 16-bit texels, texel 0 in bits [15:0]
+    //   texelselect : index (0..3) of the texel to unpack
+    //   r           : texel bits [15:10] followed by one zero bit
+    //   g           : texel bits [9:5] followed by two zero bits
+    //   b           : texel bits [4:0] followed by two zero bits
+
+    input  [63:0] dataline;
+    input  [1:0]  texelselect;
+    output [6:0]  r;
+    output [6:0]  g;
+    output [6:0]  b;
+
+    wire   [6:0]  r;
+    wire   [6:0]  g;
+    wire   [6:0]  b;
+
+    // The texel currently addressed by texelselect.
+    reg    [15:0] texel16;
+
+    // Channel outputs: each colour field is left-aligned in its 7-bit channel.
+    assign b = {texel16[4:0], 2'b00};
+    assign g = {texel16[9:5], 2'b00};
+    assign r = {texel16[15:10], 1'b0};
+
+    // 4:1 word multiplexer; all four select codes are listed.
+    always @(dataline or texelselect)
+    begin
+       case (texelselect)
+          2'b00 : texel16 = dataline[15:0];
+          2'b01 : texel16 = dataline[31:16];
+          2'b10 : texel16 = dataline[47:32];
+          2'b11 : texel16 = dataline[63:48];
+       endcase
+    end
+
+ endmodule
+ module linearmap (u, v, start, addr, texelselect, factor, enable, clk);
+    // Registered texel-address generator.
+    //   u, v        : 7-bit coordinates, captured into ul / vl on clock edges where enable is high
+    //   start       : 18-bit base address
+    //   factor      : power-of-two scale applied to vl (left shift of 0..7 bits)
+    //   addr        : start + ul[6:2] + (vl << factor), re-registered on every clock edge
+    //   texelselect : ul[1:0], taken straight from the captured u
+    //   enable, clk : capture enable and clock
+
+    input  [6:0]  u;
+    input  [6:0]  v;
+    input  [17:0] start;
+    input  [2:0]  factor;
+    input         enable;
+    input         clk;
+    output [17:0] addr;
+    output [1:0]  texelselect;
+
+    reg    [17:0] addr;
+    wire   [1:0]  texelselect;
+
+    // Coordinates held since the last enabled clock edge.
+    reg    [6:0]  ul;
+    reg    [6:0]  vl;
+
+    // Address term common to every scale factor: base plus the upper five bits of ul.
+    wire   [17:0] col_base;
+
+    assign col_base = start + {13'b0000000000000, ul[6:2]};
+    assign texelselect = ul[1:0];
+
+    always @(posedge clk)
+    begin
+       // Without enable the coordinate registers simply keep their contents.
+       if (enable == 1'b1)
+       begin
+          ul <= u;
+          vl <= v;
+       end
+
+       // addr is built from the ul / vl values present before this edge, so a new
+       // coordinate pair reaches addr one clock after it is captured.
+       case (factor)
+          3'b000 :   // vl * 1
+             addr <= col_base + {11'b00000000000, vl};
+
+          3'b001 :   // vl * 2
+             addr <= col_base + {10'b0000000000, vl, 1'b0};
+
+          3'b010 :   // vl * 4
+             addr <= col_base + {9'b000000000, vl, 2'b00};
+
+          3'b011 :   // vl * 8
+             addr <= col_base + {8'b00000000, vl, 3'b000};
+
+          3'b100 :   // vl * 16
+             addr <= col_base + {7'b0000000, vl, 4'b0000};
+
+          3'b101 :   // vl * 32
+             addr <= col_base + {6'b000000, vl, 5'b00000};
+
+          3'b110 :   // vl * 64
+             addr <= col_base + {5'b00000, vl, 6'b000000};
+
+          3'b111 :   // vl * 128
+             addr <= col_base + {4'b0000, vl, 7'b0000000};
+       endcase
+    end
+
+ endmodule
+     module bilinearintrp (u01a, u01b, u01c, v01a, v01b, v01c, u10a, u10b, u10c, v10a, v10b, v10c, selectuv, ru, rv, rw, gu, gv, gw, bu, bv, bw, r, g, b, clk);
+        // Two-stage pipelined weighted colour blend.
+        //   u01a..v10c : candidate (u, v) weight pairs for records a/b/c of the 01 and 10 sets
+        //   selectuv   : picks the pair; bit 2 chooses the 10 set over the 01 set and
+        //                bits 1:0 choose a/b/c; codes 3'b011 and 3'b111 give u = v = 0
+        //   ru..bw     : 7-bit colour components weighted by u, v and w = 255 - u - v
+        //   r, g, b    : low 7 bits of u*Xu + v*Xv + w*Xw for X = r, g, b
+        //   clk        : clock for both pipeline stages
+        // NOTE(review): the output assigns keep only the 7 LSBs of the 15-bit sums;
+        // presumably the upper bits were intended - confirm before relying on the values.
+
+        // (u, v) inputs for records a/b/c of the 01 set.
+        input  [7:0] u01a;
+        input  [7:0] u01b;
+        input  [7:0] u01c;
+        input  [7:0] v01a;
+        input  [7:0] v01b;
+        input  [7:0] v01c;
+        // (u, v) inputs for records a/b/c of the 10 set.
+        input  [7:0] u10a;
+        input  [7:0] u10b;
+        input  [7:0] u10c;
+        input  [7:0] v10a;
+        input  [7:0] v10b;
+        input  [7:0] v10c;
+        input  [2:0] selectuv;
+
+        // Colour components paired with the u, v and w weights.
+        input  [6:0] ru;
+        input  [6:0] rv;
+        input  [6:0] rw;
+        input  [6:0] gu;
+        input  [6:0] gv;
+        input  [6:0] gw;
+        input  [6:0] bu;
+        input  [6:0] bv;
+        input  [6:0] bw;
+        input        clk;
+
+        output [6:0] r;
+        output [6:0] g;
+        output [6:0] b;
+
+        wire   [6:0] r;
+        wire   [6:0] g;
+        wire   [6:0] b;
+
+        // Weight pair chosen by selectuv (combinational).
+        reg[7:0] u, v;
+
+        // Stage 1: registered weights and colour components.
+        reg[7:0] ul, vl, wl;
+        reg[6:0] rul, rvl, rwl;
+        reg[6:0] gul, gvl, gwl;
+        reg[6:0] bul, bvl, bwl;
+
+        // Stage 2: registered weight-times-colour products.
+        reg[14:0] i1r, i2r, i3r;
+        reg[14:0] i1g, i2g, i3g;
+        reg[14:0] i1b, i2b, i3b;
+
+        // Per-channel sum of the three products (cut to the 7-bit port width).
+        assign b = (i1b + i2b + i3b) ;
+        assign g = (i1g + i2g + i3g) ;
+        assign r = (i1r + i2r + i3r) ;
+
+        // Weight-pair multiplexer.
+        always @(selectuv or
+                 u01a or u01b or u01c or v01a or v01b or v01c or
+                 u10a or u10b or u10c or v10a or v10b or v10c)
+        begin
+           case (selectuv)
+              3'b000 :
+                 {u, v} = {u01a, v01a};
+
+              3'b001 :
+                 {u, v} = {u01b, v01b};
+
+              3'b010 :
+                 {u, v} = {u01c, v01c};
+
+              3'b100 :
+                 {u, v} = {u10a, v10a};
+
+              3'b101 :
+                 {u, v} = {u10b, v10b};
+
+              3'b110 :
+                 {u, v} = {u10c, v10c};
+
+              default :
+                 {u, v} = {8'b00000000, 8'b00000000};
+           endcase
+        end
+
+        // Both stages share one clocked block; every right-hand side is sampled before
+        // the edge, so the products use the weights captured one clock earlier.
+        always @(posedge clk)
+        begin
+           // Stage 1: capture the weights and the colour components.
+           ul <= u ;
+           vl <= v ;
+           wl <= 8'b11111111 - u - v ;
+           rul <= ru ;
+           gul <= gu ;
+           bul <= bu ;
+           rvl <= rv ;
+           gvl <= gv ;
+           bvl <= bv ;
+           rwl <= rw ;
+           gwl <= gw ;
+           bwl <= bw ;
+
+           // Stage 2: multiply each captured weight by its colour component.
+           i1r <= ul * rul ;
+           i1g <= ul * gul ;
+           i1b <= ul * bul ;
+           i2r <= vl * rvl ;
+           i2g <= vl * gvl ;
+           i2b <= vl * bvl ;
+           i3r <= wl * rwl ;
+           i3g <= wl * gwl ;
+           i3b <= wl * bwl ;
+        end
+
+     endmodule
+
+
+
+module fifo3 (datain, writeen, dataout, shiften, globalreset, clk);
+    // Three-entry, 18-bit wide shift FIFO with synchronous reset.
+    //   datain / writeen : word to append and its write strobe
+    //   dataout          : oldest entry (data0), always visible
+    //   shiften          : drops the oldest entry and moves the others down
+    //   globalreset      : synchronous, active high; clears the data and the occupancy
+    //   clk              : clock
+    // pos counts the stored entries (0..3).  It saturates: a write while full and a
+    // shift while empty leave pos unchanged instead of wrapping the 2-bit counter.
+
+    input[18 - 1:0] datain;
+    input writeen;
+    output[18 - 1:0] dataout;
+    wire[18 - 1:0] dataout;
+    input shiften;
+    input globalreset;
+    input clk;
+
+    reg[18 - 1:0] data0;
+    reg[18 - 1:0] data1;
+    reg[18 - 1:0] data2;
+    reg[1:0] pos;   // number of stored entries
+
+    assign dataout = data0 ;
+
+    always @(posedge clk)
+    begin
+       if (globalreset == 1'b1)
+       begin
+          pos <= 2'b00 ;
+          data0 <= 0 ;
+          data1 <= 0 ;
+          data2 <= 0 ;
+       end
+       else
+       begin
+          if (writeen == 1'b1 & shiften == 1'b1)
+          begin
+             // Shift and append in the same cycle: the occupancy does not change.
+             case (pos)
+                2'b00 :
+                         begin
+                            // NOTE(review): the incoming word is dropped when empty - confirm intent.
+                            data0 <= 0 ;
+                            data1 <= 0 ;
+                            data2 <= 0 ;
+                         end
+                2'b01 :
+                         begin
+                            data0 <= datain ;
+                            data1 <= 0 ;
+                            data2 <= 0 ;
+                         end
+                2'b10 :
+                         begin
+                            data0 <= data1 ;
+                            data1 <= datain ;
+                            data2 <= 0 ;
+                         end
+                2'b11 :
+                         begin
+                            data0 <= data1 ;
+                            data1 <= data2 ;
+                            data2 <= datain ;
+                         end
+             endcase
+          end
+          else if (shiften == 1'b1)
+          begin
+             data0 <= data1 ;
+             data1 <= data2 ;
+             // Shifting an empty FIFO must not wrap pos from 0 to 3.
+             if (pos != 2'b00) pos <= pos - 1 ;
+          end
+          else if (writeen == 1'b1)
+          begin
+             case (pos)
+                2'b00 : data0 <= datain ;
+                2'b01 : data1 <= datain ;
+                2'b10 : data2 <= datain ;
+             endcase
+             // A write while full is ignored; pos must not wrap from 3 to 0.
+             if (pos != 2'b11) pos <= pos + 1 ;
+          end
+       end
+    end
+ endmodule
+