From 9bb831378e8338961a83637e39f1e2f88428c602 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 12 Apr 2024 21:30:13 +0200 Subject: [PATCH 01/26] Introduce support for generic elementwise binary operations This includes a set of HWCustomOp and HLSBackend operator templates which can be specialized in just a few lines of code to implement arbitrary elementwise binary operations, like Add, Mul, Sub, Div, And, Equal, etc., supporting multidirectional broadcasting. Concrete implementations for most of these operators according to standard ONNX is already sketched out. Still missing are specializations for accumulator and weight bit-width minimization and some tricky to implement operators. Also still missing is floating-point support due to HLS-backend limitations, though these *seem* to be just minor defects regarding "flatten" and "Slice". Adds unit tests in Python, C++ and RTL simulation for these new operators, though these are probably not exhaustive enough to validate all edge cases. Proposes a new scheme for registering and importing custom operators into their corresponding module namespace, i.e., the 'custom_op' dictionary used to lookup operators by ONNX domain. --- .isort.cfg | 3 + custom_hls/flatten.hpp | 26 + src/finn/custom_op/fpgadataflow/__init__.py | 29 +- .../fpgadataflow/elementwise_binary.py | 552 ++++++++++++++ .../custom_op/fpgadataflow/hls/__init__.py | 33 +- .../hls/elementwise_binary_hls.py | 703 ++++++++++++++++++ src/finn/custom_op/fpgadataflow/templates.py | 2 + tests/fpgadataflow/test_elementwise_binary.py | 363 +++++++++ 8 files changed, 1707 insertions(+), 4 deletions(-) create mode 100644 custom_hls/flatten.hpp create mode 100644 src/finn/custom_op/fpgadataflow/elementwise_binary.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py create mode 100644 tests/fpgadataflow/test_elementwise_binary.py diff --git a/.isort.cfg b/.isort.cfg index 5378b88fad..efb7a4a352 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -9,3 +9,6 @@ sections=FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER default_section=THIRDPARTY multi_line_output=3 profile=black +ignore_comments=true +ignore_whitespace=true +honor_noqa=true diff --git a/custom_hls/flatten.hpp b/custom_hls/flatten.hpp new file mode 100644 index 0000000000..7b73077b48 --- /dev/null +++ b/custom_hls/flatten.hpp @@ -0,0 +1,26 @@ +#ifndef FLATTEN_HPP +#define FLATTEN_HPP + +// HLS arbitrary precision types +#include + +// Flattens an array of N elements of Type into a single bitvector +template + ap_uint flatten(const Type *buffer) { +// Inline this small piece of bit merging logic +#pragma HLS INLINE + // Fill a flat word of N times the bit-width of the element type + ap_uint flat; + // Merge all N chunks of the tile into the flat bitvector + for(unsigned j = 0; j < N; ++j) { +// Do the merging of all chunks in parallel +#pragma HLS UNROLL + // Insert the chunk into the right place of the + // bitvector + flat((j + 1) * Type::width - 1, j * Type::width) = buffer[j]; + } + // Return the buffer flattened into a single bitvector + return flat; + } + +#endif // FLATTEN_HPP diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index aed2ab7fe1..4f2f69445e 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -27,6 +27,33 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
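As an aside on the registration scheme introduced below: the following is a minimal sketch (not part of the patch) of how the decorator-populated custom_op dictionary is meant to be used, assuming the package import succeeds. The ElementwiseMinimum class and its C++ template are hypothetical; everything else refers to code added elsewhere in this patch.

    import numpy as np

    from finn.custom_op.fpgadataflow import custom_op, register_custom_op
    from finn.custom_op.fpgadataflow.elementwise_binary import (
        ElementwiseBinaryOperation
    )

    # Importing the package has already run the @register_custom_op decorators,
    # so operator classes can be resolved by name. This is how nodes with
    # domain "finn.custom_op.fpgadataflow" are looked up from this registry.
    assert issubclass(custom_op["ElementwiseAdd"], ElementwiseBinaryOperation)


    # Hypothetical further specialization: a few lines add another elementwise
    # binary operation (the matching HLS backend specialization is not shown)
    @register_custom_op
    class ElementwiseMinimum(ElementwiseBinaryOperation):
        _operation = "Minimum", np.minimum, "(({0} < {1}) ? {0} : {1})", None
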
+# The base class of all generic custom operations before specializing to either +# HLS or RTL backend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# Dictionary of HWCustomOp implementations +custom_op = dict() + + +# Registers a class into the custom_op dictionary +# Note: This must be defined first, before importing any custom op +# implementation to avoid "importing partially initialized module" issues. +def register_custom_op(cls): + # The class must actually implement HWCustomOp + assert issubclass(cls, HWCustomOp), f"{cls} must subclass {HWCustomOp}" + # Insert the class into the custom_op dictionary by its name + custom_op[cls.__name__] = cls # noqa: Some weird type annotation issue? + # Pass through the class unmodified + return cls + + +# flake8: noqa +# Disable linting from here, as all import will be flagged E402 and maybe F401 + + +# Import the submodule containing specializations of ElementwiseBinaryOperation +# Note: This will automatically register all decorated classes into this domain +import finn.custom_op.fpgadataflow.elementwise_binary from finn.custom_op.fpgadataflow.addstreams import AddStreams from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp from finn.custom_op.fpgadataflow.concat import StreamingConcat @@ -55,8 +82,6 @@ from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU -custom_op = dict() - # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["MVAU"] = MVAU diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py new file mode 100644 index 0000000000..34b788761c --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -0,0 +1,552 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np + +# Operating system stuff, e.g. 
paths +import os + +# Python warning subsystem +import warnings + +# Helper for creating ONNX nodes +from onnx import helper as oh + +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType + +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# Utility for registering HWCustomOp implementations into the module scope +from finn.custom_op.fpgadataflow import register_custom_op + +# Derive custom operators form the FINN base custom op +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# Converts inputs/outputs to/from RTL simulation format +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +# Generic implementation for elementwise binary operations +class ElementwiseBinaryOperation(HWCustomOp): + # Specifies the elementwise operation to be implemented + # Format: (Identifier, Python, C++, RTL) + _operation: tuple[str, np.ufunc, str, str] | None = None + + # Numpy operation available as property + @property + def npy_op(self) -> np.ufunc: + return self._operation[1] + + # C++ operation template available as property + @property + def cpp_op(self) -> str: + return self._operation[2] + + # RTL operation template available as property + @property + def rtl_op(self) -> str: + return self._operation[3] + + # Initializes the operator given an onnx graph node + def __init__(self, onnx_node, **kwargs): + # Just forward all arguments to the init method of the CustomOp base + super().__init__(onnx_node, **kwargs) + + # Defines attributes which must be present on this node + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = HWCustomOp.get_nodeattr_types(self) + # Update attributes dictionary for new custom operator + attrs.update({ + # Data type of the left-hand-side input elements + "lhs_dtype": ("s", True, ""), + # Data type of the right-hand-side input elements + "rhs_dtype": ("s", True, ""), + # Data type of the output elements + "out_dtype": ("s", True, ""), + # Shape of the left-hand-side input + "lhs_shape": ("ints", True, [1]), + # Shape of the right-hand-side input + "rhs_shape": ("ints", True, [1]), + # Shape of the output, mus correspond to multi-directional + # broadcasting of the left- and right-hand-side + "out_shape": ("ints", True, [1]), + # Style specifies how the left-hand-side input is provided + # Note: Might be inferred from the context + "lhs_style": ("s", False, "input", {"input", "const"}), + # Style specifies how the right-hand-side input is provided + # Note: Might be inferred from the context + "rhs_style": ("s", False, "input", {"input", "const"}), + # Number of elements in the last dimensions processed in parallel + "PE": ("i", True, 1), + # Possible execution modes for simulating this node + # Note: Override to support python mode + "exec_mode": ( + "s", False, "python", {"", "rtlsim", "cppsim", "python"} + ), + # Input and output FIFO depths for multi-I/O nodes + # Note: Need to override here as there multiple outputs + "inFIFODepths": ("ints", False, [2, 2]), + "outFIFODepths": ("ints", False, []), # Default will be override + }) + # Return updated attribute dictionary + return attrs + + # Datatype attribute as property for convenience + @property + def lhs_dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("lhs_dtype")] + + # Datatype attribute as property for convenience + @property + def rhs_dtype(self): + # Note: Converts from string to QONNX data type + return 
DataType[self.get_nodeattr("rhs_dtype")] + + # Datatype attribute as property for convenience + @property + def out_dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("out_dtype")] + + # Shape attribute as property for convenience + @property + def lhs_shape(self): + return self.get_nodeattr("lhs_shape") + + # Shape attribute as property for convenience + @property + def rhs_shape(self): + return self.get_nodeattr("rhs_shape") + + # Shape attribute as property for convenience + @property + def out_shape(self): + return self.get_nodeattr("out_shape") + + # Style attribute as property for convenience + @property + def lhs_style(self): + return self.get_nodeattr("lhs_style") + + # Style attribute as property for convenience + @property + def rhs_style(self): + return self.get_nodeattr("rhs_style") + + # Number of parallel processed elements as property for convenience + @property + def pe(self): + return self.get_nodeattr("PE") + + # Checks whether the last axis is broadcast + @property + def broadcast_last_axis(self): + return (self.lhs_shape[-1] == 1) != (self.rhs_shape[-1] == 1) + + # Makes an operation compatible with the output shape for shape inference + # Note: Propagates shape forward, i.e., never asks for the shape of the + # output, even if it seems easier. + def make_shape_compatible_op(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # There must be exactly two inputs to the binary operation + assert len(node.input) == 2, \ + f"Binary operation {node.name} requires exactly two inputs" + # Validate input shapes match what is stored as attributes + assert model.get_tensor_shape(node.input[0]) == self.lhs_shape, \ + f"Input shape mismatch: {node.name} {node.input[0]}" + assert model.get_tensor_shape(node.input[1]) == self.rhs_shape, \ + f"Input shape mismatch: {node.name} {node.input[1]}" + # Validate broadcasting of inputs to the output shape + assert (list(np.broadcast_shapes(self.lhs_shape, self.rhs_shape)) + == self.out_shape), f"Shape broadcast mismatch: {node.name}" + # Simulate behavior via the standard ONNX add operation + return oh.make_node("Add", node.input, node.output) + + # Infers the datatype of the node output + def infer_node_datatype(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op # noqa Duplicate + node = self.onnx_node + # Test for changing left-hand-side input datatype + if model.get_tensor_datatype(node.input[0]) != self.lhs_dtype: + # Get the new datatype + new_dtype = model.get_tensor_datatype(node.input[0]) + # Issue a warning message + warnings.warn( + f"{node.name}: lhs_dtype changing from" + f" {self.lhs_dtype} to {new_dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("lhs_dtype", new_dtype.name) + # Test for changing right-hand-side input datatype + if model.get_tensor_datatype(node.input[1]) != self.rhs_dtype: + # Get the new datatype + new_dtype = model.get_tensor_datatype(node.input[1]) + # Issue a warning message + warnings.warn( + f"{node.name}: rhs_dtype changing from" + f" {self.rhs_dtype} to {new_dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("rhs_dtype", new_dtype.name) + # Force the output data type stored as a node attribute + model.set_tensor_datatype(node.output[0], self.out_dtype) + + # Executes elementwise operation in python + def _execute_node_python(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the 
inputs out of the execution context + lhs = context[node.input[0]] + rhs = context[node.input[1]] + # Apply elementwise operation with broadcasting in numpy and insert + # result into the execution context + # Note: Need to make sure these have the right type for the Numpy API + # Note: Assume out_type to be always of the same kind as the inputs but + # with bit-width >= the bit-width of either of the inputs. Then, + # representing the inputs as out_type for numpy simulation is safe. + out = self.npy_op( + lhs.astype(self.out_dtype.to_numpy_dt()), + rhs.astype(self.out_dtype.to_numpy_dt()) + ) + # Make sure the output has the right type, e.g. turn all booleans into + # integers if configured by the attribute + # Note: This is relevant for logical ops, ==, <=, >=, etc. + # Note: Somehow QONNX does not like boolean tensors + context[node.output[0]] = out.astype(self.out_dtype.to_numpy_dt()) + + # Executes elementwise operation in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # C++ Simulation needs to be implemented in HLS backend specialization + raise NotImplementedError( + f"exec_mode cppsim of {self.__class__.__name__} is not implemented!" + ) + + # Executes elementwise operation in RTL simulation + def _execute_node_rtlsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op # noqa Duplicate + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Get the inputs out of the execution context + lhs = context[node.input[0]] # noqa: Duplicate code prepare simulation + rhs = context[node.input[1]] + # Validate the shape of the inputs + assert list(lhs.shape) == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + assert list(rhs.shape) == self.get_normal_input_shape(ind=1), \ + f"Input shape mismatch for {node.input[1]} {rhs.shape=}" + # Reshape the inputs into folded form + lhs = lhs.reshape(self.get_folded_input_shape(ind=0)) + rhs = rhs.reshape(self.get_folded_input_shape(ind=1)) + # Path to store the intermediate inputs in numpy format + lhs_filename = os.path.join(code_gen_dir, "lhs.npy") + rhs_filename = os.path.join(code_gen_dir, "rhs.npy") + # Save the folded inputs to file to be used by simulation + np.save(lhs_filename, lhs) + np.save(rhs_filename, rhs) + # Start collecting inputs/outputs to the RTL simulation in a dictionary + # Note: Prepare one output empty output list + io_dict = { + "inputs": {}, + "outputs": {"out": []} + } + # Type and width of the input tensors + lhs_dtype = self.get_input_datatype(ind=0) + lhs_width = self.get_instream_width(ind=0) + rhs_dtype = self.get_input_datatype(ind=1) + rhs_width = self.get_instream_width(ind=1) + + # If the left-hand-side is provided as runtime input it needs to be + # inserted into the RTL simulation inputs + if self.lhs_style == "input": + # Convert inputs to RTL simulation format + io_dict["inputs"]["lhs"] = npy_to_rtlsim_input( + lhs_filename, lhs_dtype, lhs_width + ) + + # If the right-hand-side is provided as runtime input it needs to be + # inserted into the RTL simulation inputs + if self.rhs_style == "input": + # Convert inputs to RTL simulation format + io_dict["inputs"]["rhs"] = npy_to_rtlsim_input( + rhs_filename, rhs_dtype, rhs_width + ) + + # Setup PyVerilator simulation of the node + sim = self.get_rtlsim() + # Reset the RTL simulation + super().reset_rtlsim(sim) + super().toggle_clk(sim) + # Run the RTL 
Simulation + self.rtlsim_multi_io(sim, io_dict) + + # Collect the output from RTL simulation + out = io_dict["outputs"]["out"] + # Type and sizes of the output tensor + dtype = self.get_output_datatype(ind=0) # noqa: Duplicate readout code + width = self.get_outstream_width(ind=0) + shape = self.get_folded_output_shape(ind=0) + # Path to store the intermediate numpy file + filename = os.path.join(code_gen_dir, "out.npy") + # Convert from RTL simulation format to numpy format + rtlsim_output_to_npy( + out, filename, dtype, shape, width, dtype.bitwidth() + ) + # Load the generated output numpy file + out = np.load(filename) + # Reshape the folded output and insert into the execution context + context[node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) + + # Executes elementwise op in simulation (either python c++ or rtl sim) + def execute_node(self, context, graph): + # Get the configured execution mode + mode = self.get_nodeattr("exec_mode") + # Lookup table mapping execution modes to implementing methods + exec_fns = { + "python": self._execute_node_python, + "cppsim": self._execute_node_cppsim, + "rtlsim": self._execute_node_rtlsim, + } + # Select and execute the function by mode string + exec_fns[mode](context, graph) + + # Verifies the node attributes, inputs and outputs + def verify_node(self): + # TODO: Implement + return [] + + # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff + + # Gets the datatype of input at index ind + def get_input_datatype(self, ind=0): + # Get input data type by index, order inputs from left to right + return [self.lhs_dtype, self.rhs_dtype][ind] + + # Gets the datatype of the output at index ind + def get_output_datatype(self, ind=0): + # There is only one output, the type is set as an attribute + return self.out_dtype + + # Gets the shape of the input at index ind without folding + def get_normal_input_shape(self, ind=0): + # Input shapes are stored as a node attributes + return [self.lhs_shape, self.rhs_shape][ind] + + # Gets the shape of the output at index ind without folding + def get_normal_output_shape(self, ind=0): + # The output shape is stored as a node attribute + return self.out_shape + + # Gets the shape of the input at index ind with folding + def get_folded_input_shape(self, ind=0): + # Get the normal shape before applying folding + *num_inputs, num_elems = self.get_normal_input_shape(ind=ind) + # Folding only applies if the folded axis is not broadcast + if not self.broadcast_last_axis or num_elems != 1: + # Valid folding requires the PE to divide the number of elements + assert num_elems % self.pe == 0, "PE must divide last axis" + # Folding along the last dimension + return *num_inputs, num_elems // self.pe, self.pe + # For broadcast axes return the non-folded shape with dummy axis + # inserted + return *num_inputs, 1, num_elems + + # Gets the shape of the output at index ind with folding + def get_folded_output_shape(self, ind=0): + # Get the normal shape before applying folding + *num_inputs, num_elems = self.get_normal_output_shape(ind=ind) + # Valid folding requires the PE to divide the number of elements + assert num_elems % self.pe == 0, "PE must divide last axis" + # Folding along the last dimension + return *num_inputs, num_elems // self.pe, self.pe + + # Widths of the input data stream of the input at index ind + def get_instream_width(self, ind=0): + # Get the number of bits used to represent the input + i_bits = self.get_input_datatype(ind).bitwidth() + # Parallelism is the number of 
elements in the last dimension of the + # folded input + *_, elems = self.get_folded_input_shape(ind) + # Width of a stream receiving input elements in parallel + return elems * i_bits + + # Widths of the output data stream of the output at index ind + def get_outstream_width(self, ind=0): + # Get the number of bits used to represent the output + o_bits = self.get_output_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded output + *_, elems = self.get_folded_output_shape(ind) + # Width of a stream producing output elements in parallel + return elems * o_bits + + # Gets the number of expected output values, i.e. how many times read() + # could/should be called on any output stream of this operator + def get_number_output_values(self): + # Elements over all but the last dimension of the output folded along + # the embedding dimension. + return np.prod(self.get_folded_output_shape()[:-1]) + + # Minimizes the width of the accumulator data type, 'accumulator width' here + # due to convention, it is actually the output data type + def minimize_accumulator_width(self, model: ModelWrapper): + # Depends on the actual operation performed and must be specialized by + # the concrete implementations + raise NotImplementedError( + f"minimize_accumulator_width of {self.__class__.__name__}" + f" is not implemented!" + ) + + # Minimizes the width of the weight data type, 'weight' here due to + # convention, it actually applies to any constant initializer input + def minimize_weight_bit_width(self, model: ModelWrapper): + # TODO: Can actually be implemented in the base class. Without this + # might just be inefficient for now. + pass + + +# Derive a specialization to implement elementwise addition of two inputs +@register_custom_op +class ElementwiseAdd(ElementwiseBinaryOperation): + # Specialize to implement the addition operation of left hand side and right + # hand side input + _operation = "Add", np.add, "({0} + {1})", None + + +# Derive a specialization to implement elementwise subtraction of two inputs +@register_custom_op +class ElementwiseSub(ElementwiseBinaryOperation): + # Specialize to implement the subtraction operation of left hand side and + # right hand side input + _operation = "Sub", np.subtract, "({0} - {1})", None + + +# Derive a specialization to implement elementwise multiplication of two inputs +@register_custom_op +class ElementwiseMul(ElementwiseBinaryOperation): + # Specialize to implement the multiplication operation of left hand side and + # right hand side input + _operation = "Mul", np.multiply, "({0} * {1})", None + + +# Derive a specialization to implement elementwise division of two inputs +@register_custom_op +class ElementwiseDiv(ElementwiseBinaryOperation): + # TODO: Not tested due to divide by zero from randomly generated inputs... 
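As a worked example of the folding and stream-width rules implemented by get_folded_input_shape, get_folded_output_shape and get_instream_width above, using the broadcasting shapes from the test case later in this patch and assuming INT8 inputs, an INT32 output and PE=4 (the folded shapes and widths below are derived by hand and only illustrative):

    import numpy as np

    lhs_shape, rhs_shape, pe = [3, 1, 7, 1], [3, 32, 1, 16], 4
    # Multidirectional broadcasting of the two inputs gives the output shape
    out_shape = list(np.broadcast_shapes(lhs_shape, rhs_shape))  # [3, 32, 7, 16]

    # The last axis is broadcast for lhs but not for rhs, so broadcast_last_axis
    # is True and the lhs keeps an un-folded dummy axis instead of a PE axis
    lhs_folded = [3, 1, 7, 1, 1]   # broadcast last axis: not folded by PE
    rhs_folded = [3, 32, 1, 4, 4]  # 16 elements folded into 4 groups of PE=4
    out_folded = [3, 32, 7, 4, 4]  # 16 elements folded into 4 groups of PE=4

    # Stream widths are the folded PE axis times the element bit-width:
    # lhs: 1 * 8 = 8 bits, rhs: 4 * 8 = 32 bits, out: 4 * 32 = 128 bits
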
+ # Specialize to implement the division operation of left hand side and + # right hand side input + _operation = "Div", np.divide, "({0} / {1})", None + + +# TODO: ElementwiseMod - Requires extra attribute selecting the function + + +# Derive a specialization to implement elementwise logical and of two inputs +@register_custom_op +class ElementwiseAnd(ElementwiseBinaryOperation): + # Specialize to implement the logical and operation of left hand side and + # right hand side input + _operation = "And", np.logical_and, "({0} && {1})", None + + +# Derive a specialization to implement elementwise logical or of two inputs +@register_custom_op +class ElementwiseOr(ElementwiseBinaryOperation): + # Specialize to implement the logical or operation of left hand side and + # right hand side input + _operation = "Or", np.logical_or, "({0} || {1})", None + + +# Derive a specialization to implement elementwise logical xor of two inputs +@register_custom_op +class ElementwiseXor(ElementwiseBinaryOperation): + # Specialize to implement the logical xor operation of left hand side and + # right hand side input + _operation = "Xor", np.logical_xor, "(bool({0}) != bool({1}))", None + + +# Derive a specialization to implement elementwise equality of two inputs +@register_custom_op +class ElementwiseEqual(ElementwiseBinaryOperation): + # Specialize to implement the logical equal operation of left hand side and + # right hand side input + _operation = "Equal", np.equal, "({0} == {1})", None + + +# Derive a specialization to implement elementwise less of two inputs +@register_custom_op +class ElementwiseLess(ElementwiseBinaryOperation): + # Specialize to implement the logical less operation of left hand side and + # right hand side input + _operation = "Less", np.less, "({0} < {1})", None + + +# Derive a specialization to implement elementwise less or equal of two inputs +@register_custom_op +class ElementwiseLessOrEqual(ElementwiseBinaryOperation): + # Specialize to implement the logical less or equal operation of left hand + # side and right hand side input + _operation = "LessOrEqual", np.less_equal, "({0} <= {1})", None + + +# Derive a specialization to implement elementwise greater of two inputs +@register_custom_op +class ElementwiseGreater(ElementwiseBinaryOperation): + # Specialize to implement the logical greater operation of left hand side + # and right hand side input + _operation = "Greater", np.greater, "({0} > {1})", None + + +# Derive a specialization to implement elementwise greater or equal of two +# inputs +@register_custom_op +class ElementwiseGreaterOrEqual(ElementwiseBinaryOperation): + # Specialize to implement the logical greater or equal operation of left + # hand side and right hand side input + _operation = "GreaterOrEqual", np.greater_equal, "({0} >= {1})", None + + +# Derive a specialization to implement elementwise bitwise and of two inputs +@register_custom_op +class ElementwiseBitwiseAnd(ElementwiseBinaryOperation): + # Specialize to implement the bitwise and operation of left hand side and + # right hand side input + _operation = "BitwiseAnd", np.bitwise_and, "({0} & {1})", None + + +# Derive a specialization to implement elementwise bitwise or of two inputs +@register_custom_op +class ElementwiseBitwiseOr(ElementwiseBinaryOperation): + # Specialize to implement the bitwise or operation of left hand side and + # right hand side input + _operation = "BitwiseOr", np.bitwise_or, "({0} | {1})", None + + +# Derive a specialization to implement elementwise bitwise xor of two inputs 
+@register_custom_op +class ElementwiseBitwiseXor(ElementwiseBinaryOperation): + # Specialize to implement the bitwise xor operation of left hand side and + # right hand side input + _operation = "BitwiseXor", np.bitwise_xor, "({0} ^ {1})", None + + +# TODO: ElementwiseBitShift - Requires extra attribute selecting the direction + + +# # Derive a specialization to implement elementwise power of two inputs +# TODO: std::pow does not work for HLS types and hls::pow fails to link for some +# reason +# @register_custom_op +# class ElementwisePow(ElementwiseBinaryOperation): +# # Specialize to implement the power operation of left hand side and +# # right hand side input +# _operation = "Pow", np.power, "(std::pow({0}, {1}))", None diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 405c47a08d..3fb958a99e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -26,6 +26,37 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# The base class of all HWCustomOp specializations to HLS backend implementation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend + +# The base class of all generic custom operations before specializing to either +# HLS or RTL backend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# Dictionary of HLSBackend implementations +custom_op = dict() + + +# Registers a class into the custom_op dictionary +# Note: This must be defined first, before importing any custom op +# implementation to avoid "importing partially initialized module" issues. +def register_custom_op(cls): + # The class must actually implement HWCustomOp + assert issubclass(cls, HWCustomOp), f"{cls} must subclass {HWCustomOp}" + # The class must also implement the HLSBackend + assert issubclass(cls, HLSBackend), f"{cls} must subclass {HLSBackend}" + # Insert the class into the custom_op dictionary by its name + custom_op[cls.__name__] = cls # noqa: Some weird type annotation issue? + # Pass through the class unmodified + return cls + + +# flake8: noqa +# Disable linting from here, as all import will be flagged E402 and maybe F401 + +# Import the submodule containing specializations of ElementwiseBinaryOperation +# Note: This will automatically register all decorated classes into this domain +import finn.custom_op.fpgadataflow.hls.elementwise_binary_hls from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls @@ -53,8 +84,6 @@ from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls -custom_op = dict() - # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["AddStreams_hls"] = AddStreams_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py new file mode 100644 index 0000000000..09bf77a678 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py @@ -0,0 +1,703 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. 
Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np + +# Operating system stuff, e.g. paths +import os + +# Cleanup post-processing of generated code +import textwrap + +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# Specializations of the generic HW operator +import finn.custom_op.fpgadataflow.elementwise_binary as elementwise_binary + +# Utility for registering HLSBackend HWCustomOp implementations into the module +# scope +from finn.custom_op.fpgadataflow.hls import register_custom_op + +# Base class for specializing HW operators as implemented via HLS +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend + +# Convert and pack (numpy) data for C++ code generation +from finn.util.data_packing import numpy_to_hls_code + +# The generic HW custom operator version of the operator as a base class +from finn.custom_op.fpgadataflow.elementwise_binary import ( # noqa + ElementwiseBinaryOperation +) + + +# HLS Backend specialization of the binary elementwise operation operator +class ElementwiseBinaryOperation_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation, HLSBackend +): + # Node attributes matching the HLS operator + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = ElementwiseBinaryOperation.get_nodeattr_types(self) + # Add the HLSBackend default attributes on top + attrs.update(HLSBackend.get_nodeattr_types(self)) + # Add/Specialize implementation specific attributes here... + # Return the updated attributes dictionary + return attrs + + # Executes elementwise operation in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the inputs out of the execution context + lhs = context[node.input[0]] # noqa: Duplicate code prepare simulation + rhs = context[node.input[1]] + # Validate the shape of the inputs + assert list(lhs.shape) == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + assert list(rhs.shape) == self.get_normal_input_shape(ind=1), \ + f"Input shape mismatch for {node.input[1]} {rhs.shape=}" + # Reshape the inputs into folded form + lhs = lhs.reshape(self.get_folded_input_shape(ind=0)) + rhs = rhs.reshape(self.get_folded_input_shape(ind=1)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "lhs.npy"), lhs) + np.save(os.path.join(code_gen_dir, "rhs.npy"), rhs) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, "out.npy")) + # Reshape the folded output and insert into the execution context + context[node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest of the two inputs + i_bits_max = max( + self.get_instream_width(ind=0), + self.get_instream_width(ind=1) + ) + # Width of the output, there is just one output + # Note: there is one output per replica + o_bits_max = self.get_outstream_width(ind=0) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Note: End of shape and datatype 
utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + self.code_gen_dict["$GLOBALS$"] = ['#include "flatten.hpp"'] + + # Generates C++ parameters file, i.e., constant initializer inputs + def generate_params(self, model: ModelWrapper, path: str): + # The code generation directory is specified as an argument, so this + # will work for both RTL and C++ simulation + code_gen_dir = path + # By default, assume runtime inputs not requiring code to be generated + lhs_code = rhs_code = "" + # Check for an initializer providing the left hand side input + lhs = model.get_initializer(self.onnx_node.input[0]) + # Folded output shape for broadcasting/aligning the input shapes + out_shape = self.get_folded_output_shape(ind=0) + # If the left hand side input is provided as initializer, generate + # initializer parameters code + if lhs is not None: + # Remember the "style" of receiving the input for further code + # generation + self.set_nodeattr("lhs_style", "const") + # Reshape the parameter tensor into folded shape + lhs = lhs.reshape(*self.get_folded_input_shape(ind=0)) + # Need to make sure there are PE many elements which can be accessed + # in parallel + if lhs.shape[-1] != self.pe: # noqa: Duplicate + # Broadcast the parameter tensor "offline" to have PE elements + # TODO: This replicates all parameters and might be inefficient + # in terms of memory utilization. It might be ore efficient to + # replicate the PEs when needed in docompute, probably at the + # cost of some latency for extra reads and registers. + lhs = np.broadcast_to(lhs, lhs.shape[:-1] + (self.pe,)) + # Current, maybe non-aligned input shape + lhs_shape = lhs.shape + # Fill up shape from the left to match the broadcast output shape + lhs_shape = (len(out_shape) - len(lhs_shape)) * (1,) + lhs_shape + # Reshape the input to align with the output shape + lhs = lhs.reshape(*lhs_shape) + # Generate C++ array initialization code + # Note: no packing, but with variable name/type declaration + lhs_code = numpy_to_hls_code( + lhs, self.lhs_dtype, "lhs", False, False + ) + + # Check for an initializer providing the right hand side input + rhs = model.get_initializer(self.onnx_node.input[1]) + # If the right hand side input is provided as initializer, generate + # initializer parameters code + if rhs is not None: + # Remember the "style" of receiving the input for further code + # generation + self.set_nodeattr("rhs_style", "const") + # Reshape the parameter tensor into folded shape + rhs = rhs.reshape(*self.get_folded_input_shape(ind=1)) + # Need to make sure there are PE many elements which can be accessed + # in parallel + if rhs.shape[-1] != self.pe: # noqa: Duplicate + # Broadcast the parameter tensor "offline" to have PE elements + # TODO: This replicates all parameters and might be inefficient + # in terms of memory utilization. It might be ore efficient to + # replicate the PEs when needed in docompute, probably at the + # cost of some latency for extra reads and registers. 
+ rhs = np.broadcast_to(rhs, rhs.shape[:-1] + (self.pe,)) + # Current, maybe non-aligned input shape + rhs_shape = rhs.shape + # Fill up shape from the left to match the broadcast output shape + rhs_shape = (len(out_shape) - len(rhs_shape)) * (1,) + rhs_shape + # Reshape the input to align with the output shape + rhs = rhs.reshape(*rhs_shape) + # Generate C++ array initialization code + # Note: no packing, but with variable name/type declaration + rhs_code = numpy_to_hls_code( + rhs, self.rhs_dtype, "rhs", False, False + ) + + # Open a file to store the thresholds parameters as C++ code + with open(f"{code_gen_dir}/params.hpp", "w") as file: + # Write lines of C++ code separated by newlines to the file + file.write("\n".join([ + # Insert left-hand-side and right-hand-side parameter code and + # append a newline at the end of the file (to avoid problems + # when including, required by C standard?) + lhs_code, rhs_code, "\n" + ])) + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using LhsType = {self.lhs_dtype.get_hls_datatype_str()};", + f"using RhsType = {self.rhs_dtype.get_hls_datatype_str()};", + f"using OutType = {self.out_dtype.get_hls_datatype_str()};", + # Width of single elements to avoid using ::width attribute which is + # not present for datatype float + f"static constexpr auto LhsWidth = {self.lhs_dtype.bitwidth()};", + f"static constexpr auto RhsWidth = {self.rhs_dtype.bitwidth()};", + f"static constexpr auto OutWidth = {self.out_dtype.bitwidth()};", + # Datatype of elements packed into the input stream + f"using LhsPacked = ap_uint<{self.get_instream_width(ind=0)}>;", + f"using RhsPacked = ap_uint<{self.get_instream_width(ind=1)}>;", + # Datatype of elements packed into the output stream + f"using OutPacked = ap_uint<{self.get_outstream_width(ind=0)}>;", + # Include the activation function type definitions and parameters + # Note: The typedefs in this header require the typedefs above, + # thus adding this to the global includes is not possible. 
+ '#include "params.hpp"', + # Input and output HLS stream datatypes + "using LhsStream = hls::stream<" + f" ap_uint<{self.get_instream_width(ind=0)}>" + ">;", + "using RhsStream = hls::stream<" + f" ap_uint<{self.get_instream_width(ind=1)}>" + ">;", + "using OutStream = hls::stream<" + f" ap_uint<{self.get_outstream_width(ind=0)}>" + ">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Prepare empty stream reading to append optionals + self.code_gen_dict["$READNPYDATA$"] = [] + # If the left-hand-side is provided as runtime input, read code needs + # to be generated + if self.lhs_style == "input": + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] += [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f'"{code_gen_dir}/lhs.npy", lhs_{self.hls_sname()}, false', + ');' + ] + # If the right-hand-side is provided as runtime input, read code needs + # to be generated + if self.rhs_style == "input": + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] += [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f'"{code_gen_dir}/rhs.npy", rhs_{self.hls_sname()}, false', + ');' + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Allways add the output stream to the declarations + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # Note: Assumes stream type aliases to be set in defines + f"OutStream out_{self.hls_sname()};" + ] + # If the left-hand-side is provided as runtime input, read code needs + # to be generated + if self.lhs_style == "input": + # Generate a stream declaration + self.code_gen_dict["$STREAMDECLARATIONS$"] += [ + # Note: Assumes stream type aliases to be set in defines + f"LhsStream lhs_{self.hls_sname()};" + ] + # If the right-hand-side is provided as runtime input, read code needs + # to be generated + if self.rhs_style == "input": + # Generate a stream declaration + self.code_gen_dict["$STREAMDECLARATIONS$"] += [ + # Note: Assumes stream type aliases to be set in defines + f"RhsStream rhs_{self.hls_sname()};" + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Add padding ones to a shape to match the broadcast output shape + def pad_shape(shape): + return (len(out_shape) - len(shape)) * (1,) + shape + + # Get the folded shapes of all tensors involved without PE axis + lhs_shape = self.get_folded_input_shape(ind=0)[:-1] + rhs_shape = self.get_folded_input_shape(ind=1)[:-1] + out_shape = self.get_folded_output_shape(ind=0)[:-1] + # Expanded shape of the inputs, filling with dimensions of size 1 from + # the left to align the shape with the broadcast shape + lhs_shape = pad_shape(lhs_shape) + rhs_shape = pad_shape(rhs_shape) + + # Removes contiguous matching dimensions from a shape + def drop_matching_dims(shape, like): + # Core functionality for this is implemented in itertools + from itertools import dropwhile + + # Compare shapes from left to right removing dimensions as long as + # they match + return *[ + size 
for size, _ in dropwhile( + lambda x: x[0] == x[1], zip(shape, like) + ) + ], + + # Take away all contiguous dimensions where these align with the output + # shape, as these can be consumed directly without buffering to be + # repeated + lhs_buffer_shape = drop_matching_dims(lhs_shape, out_shape) + rhs_buffer_shape = drop_matching_dims(rhs_shape, out_shape) + # Expand once again, filling with dimensions of size 1 from the left to + # align the shape with the broadcast shape + lhs_buffer_shape = pad_shape(lhs_buffer_shape) + rhs_buffer_shape = pad_shape(rhs_buffer_shape) + + # Code generation of array index strings with broadcasting + def make_index_string(shape): + # Generate index operation [i] for "normal" dimensions but reduce to + # hardcoded [0] for broadcast dimensions to repeat from a single + # buffer slot + return "".join([ + f"[i{d}]" if s != 1 else "[0]" for d, s in enumerate(shape) + ]) + + # Generate the C++ code for indexing the buffers + lhs_index = { + "input": make_index_string(lhs_buffer_shape), + "const": make_index_string(lhs_shape) + }[self.lhs_style] + rhs_index = { + "input": make_index_string(rhs_buffer_shape), + "const": make_index_string(rhs_shape) + }[self.rhs_style] + + # Generate C++ code for declaring an array of the buffer shapes + lhs_buffer_shape = "".join([f'[{size}]' for size in lhs_buffer_shape]) + rhs_buffer_shape = "".join([f'[{size}]' for size in rhs_buffer_shape]) + + # For-Loop template for nested loops over arbitrary many levels + def for_loop(level, size): + return f"for(std::size_t i{level} = 0; i{level}<{size}; ++i{level})" + + # Generate code testing for the condition when the next element needs to + # be read from the input stream according to broadcasting semantics + def read_stream_condition(shape): + # Start with the assumption that none of the dimensions is + # broadcast, meaning each individual element needs to be read from + # the stream + condition = "true" + # Search for the dimensions which are broadcast + for dim, size in enumerate(shape): + # If this dimension has a size of 1 in the input but not in the + # output, it is broadcast and contributes to the conjunctive + # reading condition if this index wraps around + if size == 1 and out_shape[dim] != 1: + # Add testing for index wrap-around to the condition + condition += f" && (i{dim} == 0)" + # Return the composed reading condition + return condition + + # Generate code for unpacking elements read from the stream into the PE- + # parallel buffer according to broadcasting semantics + def unpack_buffer(shape): + # Unpacking behavior depends on whether the last, i.e., folded PE + # dimension is broadcast + if shape[-1] == 1 and self.pe != self.out_shape[-1]: + # PE axis is broadcast, i.e., slice yields just one element + # which needs to be replicated + return "buffer(0, 0)" + # PE axis is not broadcast, i.e., slice actually yields parallel + # elements to be unpacked + return "buffer(pe, 0)" + + # Write the body of the top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # @formatter:off Disable formatter for mixed Python and C++ + # For streamed inputs, generate local buffer of non-broadcast size + # but broadcasts dimensions un-squeezed to size 1. For constant + # inputs, use the generated parameters of the same name. 
+ f""" + LhsType lhs{lhs_buffer_shape}[{self.pe}]; + """ if self.lhs_style == "input" else """""", + f""" + RhsType rhs{rhs_buffer_shape}[{self.pe}]; + """ if self.rhs_style == "input" else """""", + # Buffer to hold the parallel output elements + f""" + OutType out[{self.pe}]; + """, + # Perfect loop nest over all folded output dimensions + *[for_loop(dim, size) + " {" for dim, size in enumerate(out_shape)], + # Pipeline the loops. This should be possible as there is no code + # between the loop levels, i.e., this is a perfect loop nest. + """ + #pragma HLS pipeline II=1 style=flp + """, + # Read from the left-hand-side input stream if new elements are + # needed according to broadcasting semantics + f""" + if({read_stream_condition(lhs_shape)}) {{ + const auto buffer = Slice{{}}( + lhs_{self.hls_sname()}.read() + ); + for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{ + #pragma HLS unroll + lhs{lhs_index}[pe] = {unpack_buffer(lhs_shape)}; + }} + }} + """ if self.lhs_style == "input" else """""", + # Read from the right-hand-side input stream if new elements are + # needed according to broadcasting semantics + f""" + if({read_stream_condition(rhs_shape)}) {{ + const auto buffer = Slice{{}}( + rhs_{self.hls_sname()}.read() + ); + for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{ + #pragma HLS unroll + rhs{rhs_index}[pe] = {unpack_buffer(rhs_shape)}; + }} + }} + """ if self.rhs_style == "input" else """""", + # Apply PE parallel elementwise operations by filling the operation + # template + f""" + for(std::size_t pe = 0; pe < {self.pe}; ++pe) {{ + #pragma HLS unroll + out[pe] = {self.cpp_op.format( + f"lhs{lhs_index}[pe]", f"rhs{rhs_index}[pe]" + )}; + }} + """, + # Write the PE group into the output stream + f""" + out_{self.hls_sname()}.write(flatten<{self.pe}>(out)); + """, + # Close all for-loop bodies of the generated nest + *["}" for _ in enumerate(out_shape)] + # @formatter:on End of code generation + ] + + # Post-process the generated code to remove unnecessary white space + self.code_gen_dict["$DOCOMPUTE$"] = [ + textwrap.dedent(code) for code in self.code_gen_dict["$DOCOMPUTE$"] + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C** simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the code generation + # dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape(ind=0))) + }}}""" + # Generate function call for reading from the output stream into the + # output file + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Generate function call reading from stream into the output file + # Note: Outputs are always represented as numpy floats + 'apintstream2npy(', + f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', + ');', + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSBackends. Probably it was used + # for something before, which is now integrated into dataoutstrm()? 
+ self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. actual synthesis + def blackboxfunction(self): + # Check whether the inputs are provided at runtime to generate stream + # inputs to the toplevel interface + runtime_lhs = self.lhs_style == "input" + runtime_rhs = self.rhs_style == "input" + # Insert function head describing the top level interface of the + # attention operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + f" LhsStream &lhs_{self.hls_sname()}," if runtime_lhs else "", + f" RhsStream &rhs_{self.hls_sname()}," if runtime_rhs else "", + f" OutStream &out_{self.hls_sname()}", + ")", + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the output stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}", + ] + + # If the left-hand-side is provided as runtime input interface pragmas + # need to be inserted + if self.lhs_style == "input": + # Connect the lhs input stream with an axi stream interface + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=lhs_{self.hls_sname()}", + ] + + # If the right-hand-side is provided as runtime input interface pragmas + # need to be inserted + if self.rhs_style == "input": + # Connect the rhs input stream with an axi stream interface + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=rhs_{self.hls_sname()}", + ] + + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [] + # If the left-hand-side is provided as runtime input interface names + # need to be inserted + if self.lhs_style == "input": + intf_names["s_axis"] += [( + f"lhs_{self.hls_sname()}", self.get_instream_width_padded(ind=0) + )] + # If the right-hand-side is provided as runtime input interface names + # need to be inserted + if self.rhs_style == "input": + intf_names["s_axis"] += [( + f"rhs_{self.hls_sname()}", self.get_instream_width_padded(ind=1) + )] + # AXI stream output interfaces + intf_names["m_axis"] = [ + (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names + + +# Derive a specialization to implement elementwise addition of two inputs +@register_custom_op # noqa: PyCharm sees all these specializations as duplicate +class ElementwiseAdd_hls( # noqa: Class name does not follow + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseAdd +): + pass + + +# Derive a specialization to implement elementwise subtraction of two inputs +@register_custom_op +class ElementwiseSub_hls( # 
noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseSub +): + pass + + +# Derive a specialization to implement elementwise multiplication of two inputs +@register_custom_op +class ElementwiseMul_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseMul +): + pass + + +# Derive a specialization to implement elementwise division of two inputs +@register_custom_op +class ElementwiseDiv_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseDiv +): + pass + + +# TODO: ElementwiseMod_hls - Requires extra attribute selecting the function + +# Derive a specialization to implement elementwise logical and of two inputs +@register_custom_op +class ElementwiseAnd_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseAnd +): + pass + + +# Derive a specialization to implement elementwise logical or of two inputs +@register_custom_op +class ElementwiseOr_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseOr +): + pass + + +# Derive a specialization to implement elementwise logical xor of two inputs +@register_custom_op +class ElementwiseXor_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseXor +): + pass + + +# Derive a specialization to implement elementwise equal of two inputs +@register_custom_op # noqa: PyCharm sees all these specializations as duplicate +class ElementwiseEqual_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseEqual +): + pass + + +# Derive a specialization to implement elementwise less of two inputs +@register_custom_op +class ElementwiseLess_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseLess +): + pass + + +# Derive a specialization to implement elementwise less or equal of two inputs +@register_custom_op +class ElementwiseLessOrEqual_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseLessOrEqual +): + pass + + +# Derive a specialization to implement elementwise greater of two inputs +@register_custom_op +class ElementwiseGreater_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseGreater +): + pass + + +# Derive a specialization to implement elementwise greater or equal of two +# inputs +@register_custom_op +class ElementwiseGreaterOrEqual_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseGreaterOrEqual +): + pass + + +# Derive a specialization to implement elementwise bitwise and of two inputs +@register_custom_op +class ElementwiseBitwiseAnd_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseBitwiseAnd +): + pass + + +# Derive a specialization to implement elementwise bitwise or of two inputs +@register_custom_op +class ElementwiseBitwiseOr_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseBitwiseOr +): + 
pass + + +# Derive a specialization to implement elementwise bitwise xor of two inputs +@register_custom_op +class ElementwiseBitwiseXor_hls( # noqa: Class name does not follow + # CapWords convention + ElementwiseBinaryOperation_hls, elementwise_binary.ElementwiseBitwiseXor +): + pass + +# TODO: ElementwiseBitShift_hls - Requires extra attribute selecting the +# direction + + +# # Derive a specialization to implement elementwise power of two inputs +# TODO: std::pow does not work for HLS types and hls::pow fails to link for some +# reason +# @register_custom_op +# class ElementwisePow_hls( # noqa: Class name does not follow +# # CapWords convention +# ElementwiseBinaryOperation_hls, elementwise_binary.ElementwisePow +# ): +# pass diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..e5787bfd2a 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -29,6 +29,7 @@ # template for single node execution docompute_template = """ +#define HLS_CONSTEXPR_ENABLE #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" @@ -62,6 +63,7 @@ # cpp file ipgen_template = """ +#define HLS_CONSTEXPR_ENABLE #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "bnn-library.h" diff --git a/tests/fpgadataflow/test_elementwise_binary.py b/tests/fpgadataflow/test_elementwise_binary.py new file mode 100644 index 0000000000..5322c15069 --- /dev/null +++ b/tests/fpgadataflow/test_elementwise_binary.py @@ -0,0 +1,363 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Testing framework +import pytest + +# Numpy math and arrays +import numpy as np + +# ONNX graph and tensor utility +from onnx import TensorProto +from onnx import helper as oh + +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType + +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# Execute onnx model graphs +from qonnx.core.onnx_exec import execute_onnx + +# Registry of all QONNX CustomOps +from qonnx.custom_op.registry import getCustomOp + +# Graph transformation giving unique names to each node in a QONNX model graph +from qonnx.transformation.general import GiveUniqueNodeNames + +# QONNX graph transformations for inferring datatypes and shapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes + +# Utility for wrapping onnx graphs and generating tensor of FINN datatypes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +# FINN graph transformations for preparing simulation (cppsim or rtlsim) +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + + +# Specializes all nodes to be implemented as HLS backend +def specialize_hls(model: ModelWrapper): + # Mark all nodes to be specialized as HLS backend implementations + for node in model.graph.node: # noqa: Duplicate test setup code + # Get the 
CustomOp instance of the node to get access to the node + # attributes + inst = getCustomOp(node) + # Note: only HLS-based layers execute C++ Simulation + inst.set_nodeattr("preferred_impl_style", "hls") + # Turn all HWCustomOp layers into HLS specializations + return model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) + + +# Mapping of ElementwiseBinaryOperation specializations to numpy reference +# implementation functions +NUMPY_REFERENCES = { + "ElementwiseAdd": np.add, + "ElementwiseSub": np.subtract, + "ElementwiseMul": np.multiply, + # TODO: "ElementwiseDiv": np.divide, Cannot guarantee non-zero test input + # TODO: "ElementwiseMod": np.mode / np.fmod + "ElementwiseAnd": np.logical_and, + "ElementwiseOr": np.logical_or, + "ElementwiseXor": np.logical_xor, + "ElementwiseEqual": np.equal, + "ElementwiseLess": np.less, + "ElementwiseLessOrEqual": np.less_equal, + "ElementwiseGreater": np.greater, + "ElementwiseGreaterOrEqual": np.greater_equal, + "ElementwiseBitwiseAnd": np.bitwise_and, + "ElementwiseBitwiseOr": np.bitwise_or, + "ElementwiseBitwiseXor": np.bitwise_xor, + # TODO: "ElementwiseBitShift": np.left_shift / np.right_shift + # TODO: "ElementwisePow": np.power +} + + +# Creates a model executing a binary elementwise operation +def mock_elementwise_binary_operation( + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe +): + # Automatically derive the output shape by broadcasting the inputs + out_shape = np.broadcast_shapes(lhs_shape, rhs_shape) + # Create a node representing the binary elementwise operation + node = oh.make_node( + # Operator type from the name of the fpgadataflow hlscustomop + op_type=op_type, + # Specify the domain, i.e., the package to look for the custom operator + # implementation + domain="finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from HLSCustomOp + backend="fpgadataflow", + # Just one input + inputs=["lhs", "rhs"], + # Enumerate the outputs + outputs=["out"], + # Data type of the left-hand-side input elements + lhs_dtype=lhs_dtype, + # Data type of the right-hand-side input elements + rhs_dtype=rhs_dtype, + # Data type of the output elements + out_dtype=out_dtype, + # Shape of the left-hand-side input + lhs_shape=lhs_shape, + # Shape of the right-hand-side input + rhs_shape=rhs_shape, + # Shape of the output, mus correspond to multi-directional + # broadcasting of the left- and right-hand-side + out_shape=out_shape, + # Number of elements to process in parallel + PE=pe, + ) + # Construct the input tensor value infos + lhs = oh.make_tensor_value_info("lhs", TensorProto.FLOAT, lhs_shape) + rhs = oh.make_tensor_value_info("rhs", TensorProto.FLOAT, rhs_shape) + # Construct output tensor value infos + out = oh.make_tensor_value_info("out", TensorProto.FLOAT, out_shape) + # Create a graph connecting the node to the inputs and outputs + graph = oh.make_graph( + [node], inputs=[lhs, rhs], outputs=[out], name="elementwise-binary" + ) + # Wrap the ONNX graph in QONNX model wrapper + model = ModelWrapper( + qonnx_make_model(graph, producer_name="elementwise-binary") + ) + + # Add datatype annotation to the value info of input tensors + model.set_tensor_datatype("lhs", DataType[lhs_dtype]) + model.set_tensor_datatype("rhs", DataType[rhs_dtype]) + model.set_tensor_datatype("out", DataType[out_dtype]) + + # Return the wrapped onnx model + return model + + +# Operator type to be tested +@pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup + # Test all Numpy references specified above + 
*NUMPY_REFERENCES.keys() +]) +# Data type of the left-hand-side input elements +@pytest.mark.parametrize("lhs_dtype", ["INT8"]) +# Data type of the right-hand-side input elements +@pytest.mark.parametrize("rhs_dtype", ["INT8"]) +# Data type of the output elements +@pytest.mark.parametrize("out_dtype", ["INT32"]) +# Shape of the left-hand-side input +@pytest.mark.parametrize("lhs_shape", [ + [3, 1, 7, 1], [1] +]) +# Shape of the right-hand-side input +@pytest.mark.parametrize("rhs_shape", [ + [3, 32, 1, 16], +]) +# Which inputs to set as initializers +@pytest.mark.parametrize("initializers", [ + [], ["lhs"], ["rhs"], ["lhs", "rhs"] +]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4]) +def test_elementwise_binary_operation_python( + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe, + initializers +): + # Make dummy model for testing + model = mock_elementwise_binary_operation( # noqa: Duplicate test setup + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe + ) + # Prepare the execution context + context = { + "lhs": gen_finn_dt_tensor(DataType[lhs_dtype], lhs_shape), + "rhs": gen_finn_dt_tensor(DataType[rhs_dtype], rhs_shape) + } + + # Turn selected inputs into initializers + for name in initializers: + model.set_initializer(name, context[name]) + + # Get the numpy reference implementation for this operation + numpy_reference = NUMPY_REFERENCES[op_type] + + # Test running shape and data type inference on the model graph + model = model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + # Set model execution mode to python simulation + model = model.transform(SetExecMode("python")) + model = model.transform(GiveUniqueNodeNames()) + + # Compute ground-truth output in software + o_expected = numpy_reference( + # Note: Need to make sure these have the right type for the Numpy API + # Note: Assume out_type to be always of the same kind as the inputs but + # with bit-width >= the bit-width of either of the inputs. Then, + # representing the inputs as out_type for numpy simulation is safe. 
+ context["lhs"].astype(DataType[out_dtype].to_numpy_dt()), + context["rhs"].astype(DataType[out_dtype].to_numpy_dt()), + ) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare the expected to the produced for exact equality + assert np.all(o_produced == o_expected) + + +# Operator type to be tested +@pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup + # Test all Numpy references specified above + *NUMPY_REFERENCES.keys(), +]) +# Data type of the left-hand-side input elements +@pytest.mark.parametrize("lhs_dtype", ["INT8"]) +# Data type of the right-hand-side input elements +@pytest.mark.parametrize("rhs_dtype", ["INT8"]) +# Data type of the output elements +@pytest.mark.parametrize("out_dtype", ["INT32"]) +# Shape of the left-hand-side input +@pytest.mark.parametrize("lhs_shape", [ + [3, 1, 7, 1], [1] +]) +# Shape of the right-hand-side input +@pytest.mark.parametrize("rhs_shape", [ + [3, 32, 1, 16], +]) +# Which inputs to set as initializers +@pytest.mark.parametrize("initializers", [ + [], ["lhs"], ["rhs"], ["lhs", "rhs"] +]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +def test_elementwise_binary_operation_cppsim( + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe, + initializers +): + # Make dummy model for testing + model = mock_elementwise_binary_operation( # noqa: Duplicate test setup + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe + ) + # Prepare the execution context + context = { + "lhs": gen_finn_dt_tensor(DataType[lhs_dtype], lhs_shape), + "rhs": gen_finn_dt_tensor(DataType[rhs_dtype], rhs_shape) + } + + # Turn selected inputs into initializers + for name in initializers: + model.set_initializer(name, context[name]) + + # Get the numpy reference implementation for this operation + numpy_reference = NUMPY_REFERENCES[op_type] + + # Test running shape and data type inference on the model graph + model = model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Compute ground-truth output in software + o_expected = numpy_reference( + # Note: Need to make sure these have the right type for the Numpy API + # Note: Assume out_type to be always of the same kind as the inputs but + # with bit-width >= the bit-width of either of the inputs. Then, + # representing the inputs as out_type for numpy simulation is safe. 
+ context["lhs"].astype(DataType[out_dtype].to_numpy_dt()), + context["rhs"].astype(DataType[out_dtype].to_numpy_dt()), + ) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare the expected to the produced for exact equality + assert np.all(o_produced == o_expected) + + +# Operator type to be tested +@pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup + # Test all Numpy references specified above + *NUMPY_REFERENCES.keys() +]) +# Data type of the left-hand-side input elements +@pytest.mark.parametrize("lhs_dtype", ["INT8"]) +# Data type of the right-hand-side input elements +@pytest.mark.parametrize("rhs_dtype", ["INT8"]) +# Data type of the output elements +@pytest.mark.parametrize("out_dtype", ["INT32"]) +# Shape of the left-hand-side input +@pytest.mark.parametrize("lhs_shape", [ + [3, 1, 7, 1], [1] +]) +# Shape of the right-hand-side input +@pytest.mark.parametrize("rhs_shape", [ + [3, 32, 1, 16], +]) +# Which inputs to set as initializers +@pytest.mark.parametrize("initializers", [ + [], ["lhs"], ["rhs"], ["lhs", "rhs"] +]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +def test_elementwise_binary_operation_rtlsim( + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe, + initializers +): + # Make dummy model for testing + model = mock_elementwise_binary_operation( # noqa: Duplicate test setup + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe + ) + # Prepare the execution context + context = { + "lhs": gen_finn_dt_tensor(DataType[lhs_dtype], lhs_shape), + "rhs": gen_finn_dt_tensor(DataType[rhs_dtype], rhs_shape) + } + + # Turn selected inputs into initializers + for name in initializers: + model.set_initializer(name, context[name]) + + # Get the numpy reference implementation for this operation + numpy_reference = NUMPY_REFERENCES[op_type] + + # Test running shape and data type inference on the model graph + model = model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) + # Set model execution mode to RTL simulation + model = model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Compute ground-truth output in software + o_expected = numpy_reference( + # Note: Need to make sure these have the right type for the Numpy API + # Note: Assume out_type to be always of the same kind as the inputs but + # with bit-width >= the bit-width of either of the inputs. Then, + # representing the inputs as out_type for numpy simulation is safe. 
+ context["lhs"].astype(DataType[out_dtype].to_numpy_dt()), + context["rhs"].astype(DataType[out_dtype].to_numpy_dt()), + ) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare the expected to the produced for exact equality + assert np.all(o_produced == o_expected) From f61a5388675ea8b81f5fa85bedc20ea697b7309a Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 13 Nov 2023 15:01:37 +0100 Subject: [PATCH 02/26] [Streamline] Fix FoldQuantWeights input order and shape annotations Folding quantized initializers into add-like nodes did not repsect the order of inputs to the add node correctly. This is fixed by testing for one of the two possible orders and selecting the following indices accordingly. Shape inference following the transformation is fixed by deleting the annotations instead of propagating them incorrectly. Deleting the shape annotations should not hurt, as these are redone by running shape inference after each transformation anyways. --- .../qonnx/fold_quant_weights.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py index 0f6cbacb82..59ebe4eea3 100644 --- a/src/finn/transformation/qonnx/fold_quant_weights.py +++ b/src/finn/transformation/qonnx/fold_quant_weights.py @@ -149,7 +149,8 @@ def apply(self, model): mul_tensor = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - mul_shape, + mul_shape, # Note: This shape is known exactly as + # it is an initializer with known shape ) graph.value_info.append(mul_tensor) model.set_initializer(mul_tensor.name, scale) @@ -168,7 +169,9 @@ def apply(self, model): act_mul_tensor = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - output_shape, + None, # Note: Explicitly delete the shape + # annotation to be redone by the next shape + # inference ) graph.value_info.append(act_mul_tensor) successor.output[0] = act_mul_tensor.name @@ -186,19 +189,37 @@ def apply(self, model): div_tensor = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - mul_shape, + None, # Note: Explicitly delete the shape + # annotation to be redone by the next shape + # inference ) graph.value_info.append(div_tensor) model.set_initializer(div_tensor.name, scale) - succ_input_name = successor.input[0] + # Detect which input of the add-like successor is + # fed by the quantizer node to select the other + # branch to insert the scale factor + if successor.input[0] == node_out: + succ_input_name = successor.input[1] + else: + succ_input_name = successor.input[0] + act_mul_tensor = helper.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - output_shape, + None, # Note: Explicitly delete the shape + # annotation to be redone by the next shape + # inference ) graph.value_info.append(act_mul_tensor) - successor.input[0] = act_mul_tensor.name + + # Detect which input of the add-like successor is + # fed by the quantizer node to select the other + # branch to insert the scale factor + if successor.input[0] == node_out: + successor.input[1] = act_mul_tensor.name + else: + successor.input[0] = act_mul_tensor.name div_node = helper.make_node( "Div", @@ -210,6 +231,8 @@ def apply(self, model): # remove old node graph.node.remove(n) graph_modified = True + # Note: Running shape inference is necessary as shape + # annotations have been deleted above 
model = model.transform(InferShapes()) return (model, graph_modified) return (model, graph_modified) From 8691d3f0593988df7103e10c2eb26d1fbbb1cc79 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 20 Nov 2023 15:10:03 +0100 Subject: [PATCH 03/26] Make quantized activation handlers data layout aware This probably is still rather sketchy, but at least it tries to check the data layout annotation. For now seems to be enough for getting the thresholds of multi-head attention right, IF qonnx properly annotates the 3D layouts. --- .../qonnx/qonnx_activation_handlers.py | 59 +++++++++++++++---- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 323e391df4..451ba52c29 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -25,8 +25,8 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import numpy as np +import warnings from abc import ABC, abstractmethod from onnx import TensorProto, helper from qonnx.core.modelwrapper import ModelWrapper @@ -70,7 +70,7 @@ def _check_compatibility(self): @abstractmethod def _calculate_act_bias(self): """Calculate the activation bias, - which is introduced as an Add node behind the MultiTrheshold node. + which is introduced as an Add node behind the MultiThreshold node. """ raise NotImplementedError() @@ -82,7 +82,7 @@ def _calculate_thresholds(self): @abstractmethod def _calculate_act_scale(self): """Calculate the activation scale, - which is indroduced as a Mul node behind the Add node + which is introduced as a Mul node behind the Add node for the activation bias. """ raise NotImplementedError() @@ -157,7 +157,7 @@ def replace_quant_node(self): # Set scale and bias # If these values are scalar then they can be set as attributes # of the MultiThreshold node, if not they get inserted as adder and mul nodes - # behind the MultiTrheshold nodes. + # behind the MultiThreshold nodes. 
bias_scalar = adder_bias.shape == (1,) or len(adder_bias.shape) == 0 scale_scalar = mul_scale.shape == (1,) or len(mul_scale.shape) == 0 if scale_scalar and bias_scalar and self._q_node.op_type == "BipolarQuant": @@ -355,7 +355,7 @@ def _calculate_thresholds(self): act_node = self._model.find_direct_predecessors(self._q_node) act_node = act_node[0] if act_node.op_type == "Relu": - # Calculate thersholds, see: https://github.com/Xilinx/brevitas/blob/ + # Calculate thresholds, see: https://github.com/Xilinx/brevitas/blob/ # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/ # onnx/finn/handler/act.py#L21 num_distinct_values = 2**bit_width @@ -395,8 +395,27 @@ def _calculate_thresholds(self): else: thresholds[c][t] = step / selu_scale + # First try to consider the tensor layout of the output for determining + # the number of output channels + layout = self._model.get_tensor_layout(self._q_node.output[0]) + # If there is a layout annotation, use this to determine the index of + # the channel dimension + if layout is not None and "C" in layout: + # Lookup the index in list + cdim = layout.index("C") + # If no layout has been annotated or there is no channel dimension, fall + # back to the previous default assumption + else: + # Assume the channels to be in axis 1 + cdim = 1 + # Issue a warning to the user, so they are aware of this + warnings.warn( + f"No layout annotations for {self._q_node.output[0]}:" + f" Assuming channel dimension at index {cdim}" + ) + # ToDo: The index 1 needs to be changed to -1 for the channels last format - num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] + num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[cdim] final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) @@ -417,12 +436,12 @@ def _remove_activation_node(self, multi_threshold_node): act_node = self._model.find_direct_predecessors(self._q_node) if act_node is None: raise RuntimeError( - "For handling of Relu activations a predecesor to " "the Quant node must exist." + "For handling of Relu activations a predecessor to " "the Quant node must exist." ) act_node = act_node[0] if act_node.op_type not in self.valid_predecessor_op_types(): raise RuntimeError( - "The predecesor of the Quant node must be Relu or Selu for handling " + "The predecessor of the Quant node must be Relu or Selu for handling " "of activations." 
) @@ -509,7 +528,7 @@ def _calculate_thresholds(self): else: raise RuntimeError("Got an unexpected quantizer node type") - # Calculate thersholds, see: https://github.com/Xilinx/brevitas/ + # Calculate thresholds, see: https://github.com/Xilinx/brevitas/ # blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/ # export/onnx/finn/handler/act.py#L76 if bit_width == 1.0: @@ -537,8 +556,28 @@ def _calculate_thresholds(self): for t in range(num_thresholds): thresholds[c][t] = min_threshold[c] + step[c] * t + # First try to consider the tensor layout of the output for + # determining the number of output channels + layout = self._model.get_tensor_layout(self._q_node.output[0]) + # If there is a layout annotation, use this to determine the index + # of the channel dimension + if layout is not None and "C" in layout: + # Lookup the index in list + cdim = layout.index("C") + # If no layout has been annotated or there is no channel dimension, + # fall back to the previous default assumption + else: + # Assume the channels to be in axis 1 + cdim = 1 + # Issue a warning to the user, so they are aware of this + warnings.warn( + f"No layout annotations for {self._q_node.output[0]}:" + f" Assuming channel dimension at index {cdim}" + ) + # ToDo: The index 1 needs to be changed to -1 for the channels last format - num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] + num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[cdim] + final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) From 24acc18dcb24aea641a3947eec231551a7666266 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 13 Nov 2023 15:58:57 +0100 Subject: [PATCH 04/26] [Streamline] Fix AbsorbAddIntoMultiThreshold assumed input order Add is commutative and thus the export does not always generate the initializer as the second input. However, this was always assumed by this transformation, failing via assertion if the inputs were simply ordered differently. The transformation now handles both of the two possible input orderings. --- src/finn/transformation/streamline/absorb.py | 45 ++++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index e3e2468bba..59605ca3fa 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -30,6 +30,10 @@ import qonnx.core.data_layout as DataLayout import warnings from onnx import helper as oh +# Protobuf onnx graph node type +from onnx import NodeProto # noqa +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -100,6 +104,23 @@ def apply(self, model): return (model, graph_modified) +# Groups inputs by categories, i.e., groups dynamic inputs first, followed by +# initializers. Keeps order of inputs in each category. 
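+# For example, an Add node with inputs ("act", "const"), where only "const"
+# has an initializer, yields (["act"], ["const"]) regardless of whether the
+# constant is wired to the first or the second input.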
+def group_inputs_by_category(node: NodeProto, model: ModelWrapper): # noqa + # First select all dynamic inputs, which are those without initializer + # tensor + dynamics = [ + i for i in node.input if model.get_initializer(i) is None + ] + # Select all input which are initializers, which, by exclusion, are all + # those not among the dynamic inputs + initializers = [ + i for i in node.input if i not in dynamics + ] + # Return lists of dynamic anc initializer inputs + return dynamics, initializers + + class AbsorbAddIntoMultiThreshold(Transformation): """Absorb preceding Add ops into MultiThreshold by updating the threshold values. Only scalar/1D add vectors can be absorbed.""" @@ -113,13 +134,19 @@ def apply(self, model): if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if consumer is not None and consumer.op_type == "MultiThreshold": - add_weight_name = n.input[1] - threshold_name = consumer.input[1] - A = model.get_initializer(add_weight_name) - T = model.get_initializer(threshold_name) - assert A is not None, "Initializer for add weights is not set." + # As Add is not a join node, there must be one initializer + # and one dynamic input. We do not know their order, but + # can group them accordingly to extract the tensor names + (start,), (add_weight, ) = group_inputs_by_category( + n, model + ) + threshold = consumer.input[1] + A = model.get_initializer(add_weight) + T = model.get_initializer(threshold) + # Test for the thresholds actually being initializers + # Note: No need to validate the add_weights anymore, this + # is already handled by the grouping and is_join_node test. assert T is not None, "Initializer for thresholds is not set." - start_name = n.input[0] # we can only absorb 0d or 1d adds is_scalar = A.ndim == 0 or all(x == 1 for x in A.shape) actual_ndims = len(tuple(filter(lambda x: x > 1, A.shape))) @@ -128,13 +155,13 @@ def apply(self, model): Tnew = T - A.reshape(-1, 1) # Tnew = T - A.reshape(-1, T.shape[1]) # compute new thresholds and set initializer - model.set_initializer(threshold_name, Tnew) + model.set_initializer(threshold, Tnew) # wire add input directly to MultiThreshold - consumer.input[0] = start_name + consumer.input[0] = start # remove the add node graph.node.remove(n) graph_modified = True - return (model, graph_modified) + return model, graph_modified class AbsorbMulIntoMultiThreshold(Transformation): From e632328bed4a69aa1a658882529648c7281121a7 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 13 Mar 2024 10:17:08 +0100 Subject: [PATCH 05/26] Fix clipping range issue in RoundAndClipThresholds transformation --- src/finn/transformation/streamline/round_thresholds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 5ba5ee0ff5..2bf3630cff 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -57,10 +57,10 @@ def apply(self, model): model.set_tensor_datatype(n.input[1], idtype) graph_modified = True if idtype.is_integer() and ( - (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any() + (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any() ): # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) + Tnew = np.clip(Tnew, idtype.min(), idtype.max()) model.set_initializer(n.input[1], 
Tnew) # use same datatype as inputs for thresholds model.set_tensor_datatype(n.input[1], idtype) From 8dd85f4da06bc25d5c676970a62a12aebabb2e18 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 6 Apr 2024 17:06:03 +0200 Subject: [PATCH 06/26] Rework RoundAndClipThresholds to avoid range and type promotion issues See https://github.com/Xilinx/finn/issues/978 --- .../streamline/round_thresholds.py | 105 +++++++++++++----- 1 file changed, 76 insertions(+), 29 deletions(-) diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 2bf3630cff..2666242730 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -26,43 +26,90 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Need numpy for modifying the onnx graph tensors, which are numpy style arrays import numpy as np + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class from qonnx.transformation.base import Transformation +# Transformation running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Rounds and clips thresholds to integer values if the node inputs are integer, +# respecting range, representability and data type (promotion) of the container +# data type class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input - is unsigned, sets negative thresholds to zero.""" + is unsigned, sets negative thresholds to zero. Type-casts thresholds (back) + to the float32 container type (this is separate from the quantization + annotation). 
Runs InferDataTypes() afterward to propagate any changes to the + quantization data types.""" - def apply(self, model): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object graph = model.graph + # Keep track of whether the graph has been modified graph_modified = False - for n in graph.node: - if n.op_type == "MultiThreshold": - idtype = model.get_tensor_datatype(n.input[0]) - T = model.get_initializer(n.input[1]) - Tnew = np.ceil(T) - if idtype.is_integer() and (T != Tnew).any(): - # round up the thresholds to nearest integer - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any(): - # clip any negative thresholds if input is unsigned - Tnew = np.clip(Tnew, 0, None) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and ( - (Tnew < (idtype.min())).any() or (Tnew > (idtype.max())).any() - ): - # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min(), idtype.max()) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to initializer tensors of MultiThreshold operations + if node.op_type == "MultiThreshold": + # Try to get the thresholds initializer tensor + thresholds = model.get_initializer(node.input[1]) + # There might be no constant thresholds stored as initializer + # tensor inside the model + if thresholds is None: + # Nothing we can do, skip to the next node + continue + # Get the data type of the inputs to this operation + dtype = model.get_tensor_datatype(node.input[0]) + # This transformation only applies to thresholding operations + # operating on integer inputs + if not dtype.is_integer(): + # Nothing we can do, skip to the next node + continue + # Round thresholds up to nearest integer and clip thresholds + # outside the input range + # Note: This might promote the thresholds to float64 and + # introduce extra inaccuracies due to large integers not being + # exactly representable in floating-point representation. + # See for example: np.ceil(np.float32(16777217)) == 16777216 + # fmt: off + new_thresholds = np.clip( + np.ceil(thresholds), dtype.min(), dtype.max() + ) + # fmt: on + # Convert back to the preferred float32 container type + # Note: np.clip might have promoted the thresholds to float64 + # TODO: Maybe consider an int64 container type for thresholds + # rounded to integer? Need to check all other transformations + # and code generation through the whole FINN and QONNX stack + # first, as these probably assume a float32 container type. 
+ new_thresholds = new_thresholds.astype(np.float32) + # Insert the rounded and clipped thresholds back into the model + model.set_initializer(node.input[1], new_thresholds) + # The rounded and clipped thresholds now fit into the input data + # type + model.set_tensor_datatype(node.input[1], dtype) + # Test whether the new thresholds actually differ from the old + # ones + if np.any(new_thresholds != thresholds): + # Track the graph has been modified to inform the transform + # container to exhaustively repeat this transformation until + # no changes are possible graph_modified = True - return (model, graph_modified) + # Immediately exit here to propagate the data type changes + # before considering the next node + break + # Some data types might have changed, do one pass of data type inference + # to propagate these changes through the graph + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed to exhaustively apply this transformation again. + return model, graph_modified From 8b7c2eb0b04bb6689361d65b7a4de493a62284de Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 17 Apr 2024 10:51:26 +0200 Subject: [PATCH 07/26] [Thresholding] Make sure the output of python simulation is float32 Note: This applies to the "container" type, not the simulated quantization type. This is to prevent accidental promotion to float64. --- src/finn/custom_op/fpgadataflow/thresholding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index dde813a293..363c1572cf 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -259,7 +259,7 @@ def execute_node(self, context, graph): else: # signed offset y += act.min() - context[node.output[0]] = y + context[node.output[0]] = y.astype(np.float32) def calc_tmem(self): """Calculates and returns TMEM.""" From f01d02fdb5706e48c238ca27e136b7cbe839ff29 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 6 Apr 2024 17:10:36 +0200 Subject: [PATCH 08/26] [Tests] Rework test-cases for reworked RoundAndClipThresholds See https://github.com/Xilinx/finn/issues/978 --- .../streamline/test_round_thresholds.py | 257 ++++++++++++++++-- 1 file changed, 227 insertions(+), 30 deletions(-) diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 85c60b37d5..63375598a0 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -26,45 +26,242 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. 
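+
+# Background note on the bit-widths used below: float32 represents every
+# integer exactly only up to 2**24, so (assuming plain numpy semantics)
+#
+#   np.float32(2 ** 24) == 2 ** 24        # True, still exactly representable
+#   np.float32(2 ** 24 + 1) == 2 ** 24    # True, 16777217 rounds to 16777216
+#
+# which is why thresholds of around 25 bits are where rounding and clipping
+# in a float32 container first become lossy.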
+ +# Testing framework import pytest +# Use numpy for python execution / computing the ground truth expected values import numpy as np + +# Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper + +# QONNX data types like INT25 from qonnx.core.datatype import DataType + +# QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import qonnx_make_model +# Generate random tensors of QONNX/FINN data types for testing +from qonnx.util.basic import gen_finn_dt_tensor + +# Execution of onnx graphs within FINN import finn.core.onnx_exec as oxe + +# The transformation to be tested from finn.transformation.streamline import RoundAndClipThresholds -@pytest.mark.streamline -def test_round_thresholds(): - v = helper.make_tensor_value_info("v", TensorProto.FLOAT, [1, 4]) - thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1]) - out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4]) - node_def = helper.make_node( - "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general" +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. Without proper rounding, +# this tests only the clipping, range and type-casting behavior of the +# transformation. +@pytest.mark.parametrize("i_dtype", [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" +]) +@pytest.mark.parametrize("o_dtype", [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", "UINT8" +]) +@pytest.mark.parametrize("n_elems", [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. 
Large test case 256, hopefully amplifying any rarely occurring errors + 1, 2, 3, 4, 256 +]) +def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): + # Convert string representation of data type to onnx DataType + i_dtype = DataType[i_dtype] + t_dtype = DataType["INT25"] # Note: Matches configuration above + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + # Create a dummy MultiThreshold operation to be tested + node = helper.make_node( + # Op-Type of the node + "MultiThreshold", + # MultiThreshold is implemented under the qonnx domain + domain="qonnx.custom_op.general", + # List the names of the input tensors + inputs=["inp", "thresholds"], + # List the names of the output tensors + outputs=["out"], + # The CustomOp needs to know the data type of the output to be produced + out_dtype=str(o_dtype) + ) + # Number of threshold values required to produce outputs of type o_dtype + n_thresholds = o_dtype.get_num_possible_values() - 1 + # Create tensor value infos for all input/output tensors involved + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + # Create a tensor value info for the thresholds parameter tensor + # Note: Number of thresholds is determined by the output data type + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + # Combine node and tensor value infos into an onnx graph + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + # Wrap the model graph in a ModelWrapper container + model = ModelWrapper(helper.make_model(graph)) + # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + # Generate sorted thresholds for each of the input channels + thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) + # Set data type annotations for the input and thresholds tensor + model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + model.set_tensor_datatype("out", o_dtype) + # Set the thresholds as initializer input to the model + model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + # Before rounding the threshold data type must be as annotated + assert model.get_tensor_datatype("thresholds") == t_dtype + # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be + # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype + assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. 
+ assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 + assert all( + x.is_integer() for x in model.get_initializer("thresholds").flatten() + ) + # Execute the model after running the RoundAndClipThresholds transformation + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] + # Compare the results before and after: This is the pure integer test-case + # and no actual rounding should happen, thus the rounded operation should + # produce outputs exactly equal. + assert np.all(out_produced == out_expected) + + +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. This test case tests actual +# rounding of floating-point thresholds. +@pytest.mark.parametrize("i_dtype", [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", "UINT23", "INT24", "UINT24", "INT25", "UINT25", "INT26", "UINT26" +]) +@pytest.mark.parametrize("o_dtype", [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", "UINT8" +]) +@pytest.mark.parametrize("n_elems", [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. 
Large test case 256, hopefully amplifying any rarely occurring errors + 1, 2, 3, 4, 256 +]) +def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): + # Convert string representation of data type to onnx DataType + i_dtype = DataType[i_dtype] + t_dtype = DataType["FLOAT32"] + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + # Create a dummy MultiThreshold operation to be tested + node = helper.make_node( + # Op-Type of the node + "MultiThreshold", + # MultiThreshold is implemented under the qonnx domain + domain="qonnx.custom_op.general", + # List the names of the input tensors + inputs=["inp", "thresholds"], + # List the names of the output tensors + outputs=["out"], + # The CustomOp needs to know the data type of the output to be produced + out_dtype=str(o_dtype) + ) + # Number of threshold values required to produce outputs of type o_dtype + n_thresholds = o_dtype.get_num_possible_values() - 1 + # Create tensor value infos for all input/output tensors involved + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + # Create a tensor value info for the thresholds parameter tensor + # Note: Number of thresholds is determined by the output data type + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + # Combine node and tensor value infos into an onnx graph + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + # Wrap the model graph in a ModelWrapper container + model = ModelWrapper(helper.make_model(graph)) + # Sample random tensors of the configured input data type + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + # Draw uniformly random prototype thresholds in [0,+1] range + thresholds = np.random.rand(n_elems, n_thresholds) + # Type alias to 25-bit signed integer type used to set the range of the + # thresholds + INT25 = DataType["INT25"] # noqa: Variable name not lowercase + # Map the prototype thresholds into the test integer range and sort + thresholds = np.sort((INT25.max() - INT25.min()) * thresholds + INT25.min()) + # Set data type annotations for the input and thresholds tensor + model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + model.set_tensor_datatype("out", o_dtype) + # Set the thresholds as initializer input to the model + model.set_initializer("thresholds", thresholds) + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + # Before rounding the threshold data type must be as annotated + assert model.get_tensor_datatype("thresholds") == t_dtype + # Run the transformation to be tested + model = model.transform(RoundAndClipThresholds()) + # After this transformation, the thresholds and output data type should be + # inferred correctly + assert model.get_tensor_datatype("thresholds") == i_dtype + assert model.get_tensor_datatype("out") == o_dtype + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. 
+    assert model.get_initializer("thresholds").dtype == np.float32
+    # After rounding, all thresholds must be integers represented as float32
+    assert all(
+        x.is_integer() for x in model.get_initializer("thresholds").flatten()
+    )
-    graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out])
-    model_def = qonnx_make_model(graph_def)
-    model = ModelWrapper(model_def)
-    threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32)
-    model.set_initializer("thresholds", threshold_val)
-    model.set_tensor_datatype("v", DataType["INT8"])
-    inp_dict_f = {"v": np.floor(threshold_val).T}
-    inp_dict_n = {"v": np.round(threshold_val).T}
-    inp_dict_c = {"v": np.ceil(threshold_val).T}
-    orig_f = oxe.execute_onnx(model, inp_dict_f)["out"]
-    orig_n = oxe.execute_onnx(model, inp_dict_n)["out"]
-    orig_c = oxe.execute_onnx(model, inp_dict_c)["out"]
-    assert model.get_tensor_datatype("thresholds") == DataType["FLOAT32"]
-    new_model = model.transform(RoundAndClipThresholds())
-    # rounded up thresholds should have same dtype as input
-    assert new_model.get_tensor_datatype("thresholds") == DataType["INT8"]
-    new_f = oxe.execute_onnx(new_model, inp_dict_f)["out"]
-    new_n = oxe.execute_onnx(new_model, inp_dict_n)["out"]
-    new_c = oxe.execute_onnx(new_model, inp_dict_c)["out"]
-    assert np.isclose(orig_f, new_f, atol=1e-3).all()
-    assert np.isclose(orig_n, new_n, atol=1e-3).all()
-    assert np.isclose(orig_c, new_c, atol=1e-3).all()
+    # Execute the model after running the RoundAndClipThresholds transformation
+    out_produced = oxe.execute_onnx(model, {"inp": inp})["out"]
+    # Compare the results before and after: This is the floating-point test with
+    # actual rounding, thus the transformed result may only be equal within some
+    # tolerance.
+    # Hm, never observed this to be relevant. For all test configurations, exact
+    # equality seems to hold, probably due to only integer inputs being tested.
+    assert np.allclose(out_produced, out_expected, atol=1.0e-3)

From 023d95061da88db87c2257dde2ca2dd0b4982561 Mon Sep 17 00:00:00 2001
From: Christoph Berganski
Date: Wed, 17 Apr 2024 11:47:43 +0200
Subject: [PATCH 09/26] [Streamline] Check validity of broadcasting Add into
 MultiThreshold

Up until now, this was not a problem, as QONNX and FINN assumed all tensors
to be either broadcasted offline, or, if not, be "trivially" broadcastable,
like scalars or effectively scalar tensors. With the introduction of proper
multidirectional broadcasting for elementwise binary operations, this might
not be the case anymore and we need to explicitly reject these from being
absorbed into multi-thresholds, if broadcasting is not possible (otherwise,
without testing, this transformation just fails with some numpy exception).
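
As a minimal sketch of the compatibility check (mirroring the
can_broadcast_shapes helper added below; the function name and example shapes
here are only illustrative), numpy's broadcasting rules are queried directly
and any failure simply rejects the absorption:

    import numpy as np

    def can_absorb(add_shape, threshold_shape):
        # The reshaped Add parameter must broadcast against the
        # (channels, steps) thresholds tensor without changing its rank,
        # otherwise T - A does not line up with the threshold channels
        try:
            return len(np.broadcast_shapes(threshold_shape, add_shape)) == 2
        except ValueError:
            # Incompatible shapes raise ValueError and are rejected
            return False

    assert can_absorb((16, 1), (16, 255))      # one offset per channel
    assert not can_absorb((4, 1), (16, 255))   # absorption must be rejected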
---
 src/finn/transformation/streamline/absorb.py | 41 +++++++++++++++-----
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py
index 59605ca3fa..fc09d2dedd 100644
--- a/src/finn/transformation/streamline/absorb.py
+++ b/src/finn/transformation/streamline/absorb.py
@@ -151,16 +151,39 @@ def apply(self, model):
                     is_scalar = A.ndim == 0 or all(x == 1 for x in A.shape)
                     actual_ndims = len(tuple(filter(lambda x: x > 1, A.shape)))
                     is_1d = actual_ndims == 1
+
+                    def can_broadcast_shapes(lhs, rhs):
+                        # Broadcasting might raise an exception
+                        try:
+                            # Try broadcasting the shapes
+                            if len(np.broadcast_shapes(lhs, rhs)) == 2:
+                                # These tensors can be broadcast, preserving the
+                                # left-hand-side shape
+                                return True
+                            # These tensors cannot be broadcast
+                            return False
+                        # Failing to broadcast the tensors raises ValueError
+                        except ValueError:
+                            # These tensors cannot be broadcast
+                            return False
+
                     if is_scalar or is_1d:
-                        Tnew = T - A.reshape(-1, 1)
-                        # Tnew = T - A.reshape(-1, T.shape[1])
-                        # compute new thresholds and set initializer
-                        model.set_initializer(threshold, Tnew)
-                        # wire add input directly to MultiThreshold
-                        consumer.input[0] = start
-                        # remove the add node
-                        graph.node.remove(n)
-                        graph_modified = True
+                        # Reshape addition parameters to have the elements/PE
+                        # dimension first, aligned with the thresholds.
+                        A = A.reshape(-1, 1)  # noqa: Not lowercase
+                        # Check that we can actually broadcast the addition
+                        # weights to the thresholds tensors, i.e., it is adding
+                        # along the right axis
+                        if can_broadcast_shapes(T.shape, A.shape):
+                            Tnew = T - A  # noqa: Not lowercase
+                            # Tnew = T - A.reshape(-1, T.shape[1])
+                            # compute new thresholds and set initializer
+                            model.set_initializer(threshold, Tnew)
+                            # wire add input directly to MultiThreshold
+                            consumer.input[0] = start
+                            # remove the add node
+                            graph.node.remove(n)
+                            graph_modified = True
         return model, graph_modified

From 945db12d20e4e03bae305a5959cd6f54931b36f3 Mon Sep 17 00:00:00 2001
From: Christoph Berganski
Date: Wed, 17 Apr 2024 11:53:52 +0200
Subject: [PATCH 10/26] [Streamline] Fix backwards-propagating shapes in
 MoveAddPastMul

Shapes propagating backwards in graph transformations can break subsequent
shape inference. In particular, this is the case for operators involving
broadcasting semantics, where the output shape cannot be fully reflected in
the input shapes, i.e., even for elementwise operations, the input shape
might not be identical to the output shape. This is fixed by deleting
problematic shape annotations to be re-done immediately.
---
 src/finn/transformation/streamline/reorder.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index 8ac2d7dad6..e471eff441 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -91,13 +91,21 @@ def apply(self, model):
                     graph.node.insert(node_ind + 1, new_add)
                     # replace add value
                     model.set_initializer(add_weight_name, BA)
+                    # Delete the shape annotation of the connecting tensors
+                    # to be re-done later. This prevents shapes from propagating
+                    # backwards.
+                    # Note: Do not delete annotation for the input tensor, as
+                    # this prevents future shape inference.
+ model.set_tensor_shape(middle_name, None) + model.set_tensor_shape(end_name, None) # remove old nodes graph.node.remove(n) graph.node.remove(consumer) graph_modified = True - + # Note: Running shape inference is necessary as shape + # annotations have been deleted above model = model.transform(InferShapes()) - return (model, graph_modified) + return model, graph_modified class MoveScalarMulPastMatMul(Transformation): From 3f13673e88640efa9c10ba6020f7ca6b1e04766c Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 18 Apr 2024 17:53:59 +0200 Subject: [PATCH 11/26] [Elementwise] Add InferElementwiseBinaryOperation transformation --- .../fpgadataflow/elementwise_binary.py | 2 +- .../fpgadataflow/convert_to_hw_layers.py | 138 +++++++++++++++++- 2 files changed, 138 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index 34b788761c..0aee0959bd 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -82,7 +82,7 @@ def get_nodeattr_types(self): # Note: Might be inferred from the context "rhs_style": ("s", False, "input", {"input", "const"}), # Number of elements in the last dimensions processed in parallel - "PE": ("i", True, 1), + "PE": ("i", False, 1), # Possible execution modes for simulating this node # Note: Override to support python mode "exec_mode": ( diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 897d714bf8..7aa28999de 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -30,8 +30,11 @@ import numpy as np import qonnx.core.data_layout as DataLayout import warnings -from onnx import TensorProto, helper +from onnx import NodeProto, TensorProto, helper from qonnx.core.datatype import DataType + +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import SortGraph @@ -40,6 +43,12 @@ from qonnx.util.basic import get_by_name from qonnx.util.onnx import nchw_to_nhwc +# Module containing specializations of elementwise binary operations +import finn.custom_op.fpgadataflow.elementwise_binary as elementwise_binary + +# Base class for all FINN custom ops, here just used for type-hinting +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" @@ -1691,3 +1700,130 @@ def apply(self, model): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +# Lifts scalar to rank-1 tensor +def lift_to_rank1(name: str, model: ModelWrapper): + # Scalars have a shape of lengths zero + if len(model.get_tensor_shape(name)) == 0: + # Lift shape to rank-1 tensor with single element + model.set_tensor_shape(name, [1]) + # Check whether this tensor has an initializer + if (tensor := model.get_initializer(name)) is not None: + # Set new initializer tensor of shape [1] + model.set_initializer(name, tensor.reshape(1)) + + +# Converts supported elementwise binary operations to their FINN custom +# operation +class InferElementwiseBinaryOperation(Transformation): + # Filter function to filter out the last elementwise Mul 
operation, + # typically corresponding to output de-quantization, which should happen + # off-chip + @staticmethod + def reject_output_dequant(model: ModelWrapper, node: NodeProto): + # The operator must be a Mul and have no successor nodes + if node.op_type == "Mul" and not model.find_direct_successors(node): + # If the output is a floating-point tensors, reject this + if model.get_tensor_datatype(node.output[0]) == "FLOAT32": + # Filter False rejects this node + return False + # Filter True accepts this node + return True + + # Filter function to filter out any operation involving any floating-point + # tensor + @staticmethod + def reject_floats(model: ModelWrapper, node: NodeProto): + # Check for any input being floating-point + if any(model.get_tensor_datatype(x) == "FLOAT32" for x in node.input): + # Filter False rejects this node + return False + # Check for any output being floating-point + if any(model.get_tensor_datatype(x) == "FLOAT32" for x in node.output): + # Filter False rejects this node + return False + # Filter True accepts this node + return True + + # Initializes the transformation method with an optional filter function + def __init__(self, _filter=None): + # Initialize the base class Transformation object + super().__init__() + # Register the filter function as attribute + self._filter = _filter if _filter is not None else lambda *_: True + + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Skip transforming nodes rejected by the filter + if not self._filter(model, node): + continue + # If a custom operation with corresponding name is implemented in + # the module, this operator is supported for conversion + if f"Elementwise{node.op_type}" in dir(elementwise_binary): + # Transplant this operator into our FINN domain + node.domain = "finn.custom_op.fpgadataflow" + # Adapt the op-type prefixing it with Elementwise + # TODO: Consider dropping the prefix? + node.op_type = f"Elementwise{node.op_type}" + # Now we can get the CustomOp wrapper instance providing easier + # attribute access + inst: HWCustomOp = getCustomOp(node) + # Set the backend attribute to mark this an operation supported + # to be implemented on an FPGA by FINN + inst.set_nodeattr("backend", "fpgadataflow") + # Need to "lift" potential scalar inputs to rank-1 tensors + lift_to_rank1(node.input[0], model) + lift_to_rank1(node.input[1], model) + + # fmt: off + # Disable formatter. This is deliberately formatted to stay + # within 80 characters per line. Black, however, formats some + # lines going beyond this. + + # Insert data type attributes from "context" into the CustomOp + # node + # TODO: Find a way to handle this via data type inference? + inst.set_nodeattr( + "lhs_dtype", str(model.get_tensor_datatype(node.input[0])) + ) + inst.set_nodeattr( + "rhs_dtype", str(model.get_tensor_datatype(node.input[1])) + ) + inst.set_nodeattr( + "out_dtype", str(model.get_tensor_datatype(node.output[0])) + ) + # Insert shape attributes from "context" into the CustomOp node + # TODO: Find a way to handle this via shape inference? 
+ inst.set_nodeattr( + "lhs_shape", model.get_tensor_shape(node.input[0]) + ) + inst.set_nodeattr( + "rhs_shape", model.get_tensor_shape(node.input[1]) + ) + inst.set_nodeattr( + "out_shape", model.get_tensor_shape(node.output[0]) + ) + + # fmt: on + + # Consider the graph to be modified, triggering exhaustive + # re-application of this transformation + graph_modified = True + # Exiting here triggers type and shape inference and cleanup + # after each transformed node. This helps QONNX to behave + # better / more consistent in certain cases... + break + # Re-do shape and data type annotations after potential changes to the + # model graph + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified From 6a6616a01e9331b788362fc4676ae327107bba82 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 18 Apr 2024 17:56:42 +0200 Subject: [PATCH 12/26] [Tests] Add simple integration test for ElementwiseBinaryOperation The new test case tests export, streamlining, conversion to hardware layers and subsequent Python, C++ and RTL simulation of QuantEltwiseAdd from Brevitas, serving as a representative example of an elementwise binary operation. --- tests/fpgadataflow/test_elementwise_binary.py | 226 +++++++++++++++++- 1 file changed, 219 insertions(+), 7 deletions(-) diff --git a/tests/fpgadataflow/test_elementwise_binary.py b/tests/fpgadataflow/test_elementwise_binary.py index 5322c15069..57ace2b72f 100644 --- a/tests/fpgadataflow/test_elementwise_binary.py +++ b/tests/fpgadataflow/test_elementwise_binary.py @@ -8,6 +8,19 @@ # Numpy math and arrays import numpy as np +# Create temporary files automatically deleted after integration test +import tempfile + +# PyTorch required for integration test +import torch + +# Export brevitas models to QONNX representation in integration test +from brevitas.export import export_qonnx + +# Test the quantized elementwise addition operation from brevitas in integration +# test: this one should be representative enough for the operator pattern +from brevitas.nn import QuantEltwiseAdd + # ONNX graph and tensor utility from onnx import TensorProto from onnx import helper as oh @@ -24,8 +37,18 @@ # Registry of all QONNX CustomOps from qonnx.custom_op.registry import getCustomOp -# Graph transformation giving unique names to each node in a QONNX model graph -from qonnx.transformation.general import GiveUniqueNodeNames +# Cleanup transformations required after QONNX model import +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, + RemoveUnusedTensors, +) + +# Adds data layout annotations to the model graph to correctly convert +# quantizers to multi-thresholds +from qonnx.transformation.infer_data_layouts import InferDataLayouts # QONNX graph transformations for inferring datatypes and shapes from qonnx.transformation.infer_datatypes import InferDataTypes @@ -36,6 +59,15 @@ # FINN graph transformations for preparing simulation (cppsim or rtlsim) from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim + +# Mapping to hardware operators of the two operations relevant for the +# integration test +# Note: The integration test serves as the test-case for +# InferElementwiseBinaryOperation +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferElementwiseBinaryOperation, + 
InferThresholdingLayer, +) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP @@ -43,16 +75,32 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +# Converts between QONNX and FINN dialect of ONNX representation +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + +# Standard set of streamlining transformations delivered with FINN +from finn.transformation.streamline import Streamline + +# Specific streamlining transformations which needs to be applied manually in +# integration test +from finn.transformation.streamline.absorb import AbsorbMulIntoMultiThreshold + +# Checks whether a node is a fpgadataflow backend node handled by FINN +from finn.util.fpgadataflow import is_fpgadataflow_node + # Specializes all nodes to be implemented as HLS backend def specialize_hls(model: ModelWrapper): # Mark all nodes to be specialized as HLS backend implementations for node in model.graph.node: # noqa: Duplicate test setup code - # Get the CustomOp instance of the node to get access to the node - # attributes - inst = getCustomOp(node) - # Note: only HLS-based layers execute C++ Simulation - inst.set_nodeattr("preferred_impl_style", "hls") + # Skip non-fpgadataflow backend operators as these do not have the + # preferred_impl_style attribute + if is_fpgadataflow_node(node): + # Get the CustomOp instance of the node to get access to the node + # attributes + inst = getCustomOp(node) + # Note: only HLS-based layers execute C++ Simulation + inst.set_nodeattr("preferred_impl_style", "hls") # Turn all HWCustomOp layers into HLS specializations return model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) @@ -361,3 +409,167 @@ def test_elementwise_binary_operation_rtlsim( # Compare the expected to the produced for exact equality assert np.all(o_produced == o_expected) + + +# Test-case setting up a complete dummy model containing various elementwise +# binary operations in PyTorch, converting to QONNX and verifying in Python, C++ +# and RTL simulation +# Shape of the left-hand-side input +# Note: Stripped down test of broadcasting semantics due to rather poor support +# for arbitrary data layouts inf QONNX and FINN: Only 2d and 4d layouts (with +# certain assumptions/restrictions) are really supported. +# Note: Cannot test scalar shapes (or effectively scalar shapes like [1,1]), due +# to streamlining integrating those into MultiThresholds (removing the operator +# to be tested), leading to consecutive quantizers. Consecutive quantizers +# should be avoided as this sometimes can cause range and precision errors. 
+@pytest.mark.parametrize("lhs_shape", [[32, 1]]) +# Shape of the right-hand-side input +@pytest.mark.parametrize("rhs_shape", [[32, 16]]) +# Which inputs to set as initializers +@pytest.mark.parametrize("initializers", [[], ["lhs"], ["rhs"]]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +def test_elementwise_binary_operation_integration_elementwise_add( + lhs_shape, rhs_shape, pe, initializers +): + # PyTorch model wrapping the component(s) to be tested + class Dummy(torch.nn.Module): + # Sets up the test model and initializes parameters + def __init__(self): + # Initialize the PyTorch Module superclass + super().__init__() + # Elementwise addition component to be tested + self.add = QuantEltwiseAdd() + # Left- and right-hand-side input tensors in case these are set to + # be initializers + self.lhs = torch.randn(*lhs_shape) + self.rhs = torch.randn(*rhs_shape) + + # Model forward pass taking multiple inputs as arguments + def forward(self, *xs): + # Depending on the test configuration, extract inputs to the add + # operation from model inputs of from model parameters + _lhs = self.lhs if "lhs" in initializers else xs[0] + _rhs = self.rhs if "rhs" in initializers else xs[1] + # Quantized elementwise addition of the two inputs + return self.add(_lhs, _rhs) + + # Create the test instance of the dummy model + model = Dummy() + # Create dummy test inputs + lhs = torch.randn(*lhs_shape) + rhs = torch.randn(*rhs_shape) + # Do a forward pass with model in training mode to calibrate the quantizers + _ = model(lhs, rhs) + # Switch model to evaluation mode to keep parameters fixed for export + model = model.eval() + # Do not accumulate gradients while generating test output + with torch.no_grad(): + # Model forward pass generating the expected output for verification + out_expected = model(lhs, rhs).numpy().astype(np.float32) + # Generate a temporary directory for running this test + with tempfile.TemporaryDirectory() as tmp: + # Export the model to ONNX format to be consumed by FINN + export_qonnx(model, (lhs, rhs), tmp + "/model.onnx") + # Wrap the model with QONNX wrapper for transformations + model = ModelWrapper(tmp + "/model.onnx") + # Cleanup transformations preparing the model to be consumed by FINN + model = model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + model = model.transform(ConvertQONNXtoFINN()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveUnusedTensors()) + # Need to absorb scalar multiplication into the thresholding layer + # first, to prevent large rounding error due to moving these in front of + # add operations later. 
+ model = model.transform(AbsorbMulIntoMultiThreshold()) + # Do a single round of standard streamlining of the model graph + model = model.transform(Streamline()) + # Convert layers to hardware custom operations + model = model.transform(InferThresholdingLayer()) + model = model.transform(InferElementwiseBinaryOperation( + # We want to keep the output de-quantization off-chip + _filter=InferElementwiseBinaryOperation.reject_floats + )) + + # Apply folding config to set the PE parallelism for hardware layers + model = model.transform(ApplyConfig({ + "Defaults": {"PE": [pe, ["ElementwiseAdd", "Thresholding"]]} + })) + + # Prepare the execution context with dummy data from above and input + # node names extracted from transformed modelo graph + context = {} + + # Convert verification inputs to numpy format used by ONNX execution + lhs = lhs.numpy().astype(np.float32) + rhs = rhs.numpy().astype(np.float32) + + # If the left-hand-side is not an initializer, it must be an input + # inserted into the execution context + if "lhs" not in initializers: + # Left-hand-side is always the first input + context[model.graph.input[0].name] = lhs + + # If the right-hand-side is not an initializer, it must be an input + # inserted into the execution context + if "rhs" not in initializers: + # Index of the right-hand-side input depends on whether there is a + # left-hand-side input + rhs_index = int("lhs" not in initializers) + context[model.graph.input[rhs_index].name] = rhs + + # Set model execution mode to python simulation + model = model.transform(SetExecMode("python")) + model = model.transform(GiveUniqueNodeNames()) + # Execute the onnx model to collect the result + out_produced = execute_onnx(model, context)[model.graph.output[0].name] + # Compare the expected to the produced + # Note: Only test for close up to some tolerance as the modelo has + # streamlined, which may involve rounding + assert np.allclose(out_produced, out_expected, atol=1e-3), \ + "Python simulation verification failed" + + # Apply folding config to implement Thresholding layers in RTL mode + # Note: Must be done in RTL for now to avoid test failing due to + # PE-parallel stream being too wide for Vitis HLS. 
+ model = model.transform(ApplyConfig({ + "Defaults": {"preferred_impl_style": ["rtl", ["Thresholding"]]} + })) + # # Specializes all nodes to their backend implementation + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) + + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + model = model.transform(GiveUniqueNodeNames()) + # Generates the C++ source and compiles the C++ simulation + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + # Execute the onnx model to collect the result + out_produced = execute_onnx(model, context)[model.graph.output[0].name] + # Compare the expected to the produced + # Note: Only test for close up to some tolerance as the modelo has + # streamlined, which may involve rounding + assert np.allclose(out_produced, out_expected, atol=1e-3), \ + "C++ simulation verification failed" + + # Set model execution mode to RTL simulation + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + # Execute the onnx model to collect the result + out_produced = execute_onnx(model, context)[model.graph.output[0].name] + # Compare the expected to the produced + # Note: Only test for close up to some tolerance as the modelo has + # streamlined, which may involve rounding + assert np.allclose(out_produced, out_expected, atol=1e-3), \ + "RTL simulation verification failed" From fd1aedd9525ada11c0c76ff6f930608de8692033 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 19 Apr 2024 13:03:45 +0200 Subject: [PATCH 13/26] [Elementwise] Some cleanup / simplification of generated code --- custom_hls/flatten.hpp | 2 +- .../fpgadataflow/hls/elementwise_binary_hls.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/custom_hls/flatten.hpp b/custom_hls/flatten.hpp index 7b73077b48..a5c95b69e7 100644 --- a/custom_hls/flatten.hpp +++ b/custom_hls/flatten.hpp @@ -6,7 +6,7 @@ // Flattens an array of N elements of Type into a single bitvector template - ap_uint flatten(const Type *buffer) { + ap_uint flatten(const Type buffer[N]) { // Inline this small piece of bit merging logic #pragma HLS INLINE // Fill a flat word of N times the bit-width of the element type diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py index 09bf77a678..37a70fd120 100644 --- a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py @@ -204,15 +204,9 @@ def defines(self, var): # thus adding this to the global includes is not possible. 
'#include "params.hpp"', # Input and output HLS stream datatypes - "using LhsStream = hls::stream<" - f" ap_uint<{self.get_instream_width(ind=0)}>" - ">;", - "using RhsStream = hls::stream<" - f" ap_uint<{self.get_instream_width(ind=1)}>" - ">;", - "using OutStream = hls::stream<" - f" ap_uint<{self.get_outstream_width(ind=0)}>" - ">;", + "using LhsStream = hls::stream;", + "using RhsStream = hls::stream;", + "using OutStream = hls::stream;", ] # Generates C++ code for reading data from .npy (numpy format) for testing From f010d18a0260245bfa8b0cc4fd923ca3f0395d5e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 19 Apr 2024 13:05:22 +0200 Subject: [PATCH 14/26] [Streamline] Fix shape propagation of MoveLinearPastEltwiseAdd Shape propagation when reordering around elementwise addition did not behave as expected when any of the tensors is broadcast by one of the reordered operations. This is fixed by deleting and re-doing the shape annotations for the connecting tensors of the reordered pattern. --- src/finn/transformation/streamline/reorder.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index e471eff441..1d6001a381 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -588,6 +588,11 @@ def apply(self, model): if prod0.op_type == "Mul" and prod1.op_type == "Mul": if np.array_equal(init0, init1): self.move_node(graph, n, prod0, prod1, node_ind) + # Delete shape annotations of connecting tensors to be + # re-done later. This prevents wrong shape propagation, + # for example in cases where the Add broadcasts shapes. + model.set_tensor_shape(n.output[0], None) + model.set_tensor_shape(prod0.output[0], None) node_ind -= 1 graph_modified = True elif prod0.op_type == "Add" and prod1.op_type == "Add": @@ -595,12 +600,20 @@ def apply(self, model): # update initializer of prod0, which we'll move model.set_initializer(prod0.input[1], init) self.move_node(graph, n, prod0, prod1, node_ind) + # Delete shape annotations of connecting tensors to be + # re-done later. This prevents wrong shape propagation, + # for example in cases where the Add broadcasts shapes. + model.set_tensor_shape(n.output[0], None) + model.set_tensor_shape(prod0.output[0], None) node_ind -= 1 graph_modified = True else: continue + # Note: Running shape inference is necessary as shape annotations have + # been deleted above model = model.transform(InferShapes()) - return (model, graph_modified) + model = model.transform(InferDataTypes()) + return model, graph_modified class MoveScalarLinearPastInvariants(Transformation): From 7aaf739b04589422295c6e75a61f1910bb778c48 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 19 Apr 2024 13:10:12 +0200 Subject: [PATCH 15/26] [Tests] Add missing streamlining for testing ElementwiseBinaryOperation Without MoveLinearPastEltwiseAdd the two input streams variant of the integration test did not actually convert the elementiwse addition to a hardware operator, effectively "testing" the vanilla ONNX version of the operator. With this transformation and AbsorbSignBiasIntoMultiThreshold to get the signs right, the hardware operator is tested as intended now. 
--- tests/fpgadataflow/test_elementwise_binary.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_elementwise_binary.py b/tests/fpgadataflow/test_elementwise_binary.py index 57ace2b72f..1aa02bd090 100644 --- a/tests/fpgadataflow/test_elementwise_binary.py +++ b/tests/fpgadataflow/test_elementwise_binary.py @@ -83,7 +83,11 @@ # Specific streamlining transformations which needs to be applied manually in # integration test -from finn.transformation.streamline.absorb import AbsorbMulIntoMultiThreshold +from finn.transformation.streamline.absorb import ( + AbsorbMulIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, +) +from finn.transformation.streamline.reorder import MoveLinearPastEltwiseAdd # Checks whether a node is a fpgadataflow backend node handled by FINN from finn.util.fpgadataflow import is_fpgadataflow_node @@ -433,7 +437,7 @@ def test_elementwise_binary_operation_rtlsim( @pytest.mark.fpgadataflow @pytest.mark.slow def test_elementwise_binary_operation_integration_elementwise_add( - lhs_shape, rhs_shape, pe, initializers + lhs_shape, rhs_shape, initializers, pe ): # PyTorch model wrapping the component(s) to be tested class Dummy(torch.nn.Module): @@ -489,6 +493,13 @@ def forward(self, *xs): # first, to prevent large rounding error due to moving these in front of # add operations later. model = model.transform(AbsorbMulIntoMultiThreshold()) + # Need to absorb the sign bias of the quantizer back into the + # corresponding thresholds first instead of moving them past the next + # operator to avoid sign and range issues. + model = model.transform(AbsorbSignBiasIntoMultiThreshold()) + # There might be identical Mul in front of the joining Add node + model = model.transform(MoveLinearPastEltwiseAdd()) + model = model.transform(AbsorbMulIntoMultiThreshold()) # Do a single round of standard streamlining of the model graph model = model.transform(Streamline()) # Convert layers to hardware custom operations From 5268ffe0cd29e8025c9316192aaba6fb93a3a094 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 19 Apr 2024 17:19:07 +0200 Subject: [PATCH 16/26] [Elementwise] Implement bit-width minimization for all specializations This is done mostly according to the Vitis High-Level Synthesis User Guide (UG1399), see the library reference on arbitrary precision integer types. The new transformations are added to all relevant test cases and some data type need to be adjusted to make the numpy references behave more robust. --- .../fpgadataflow/elementwise_binary.py | 269 +++++++++++++++++- tests/fpgadataflow/test_elementwise_binary.py | 52 +++- 2 files changed, 293 insertions(+), 28 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index 0aee0959bd..309325905e 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -208,18 +208,13 @@ def _execute_node_python(self, context, graph): # noqa: graph unused # Apply elementwise operation with broadcasting in numpy and insert # result into the execution context # Note: Need to make sure these have the right type for the Numpy API - # Note: Assume out_type to be always of the same kind as the inputs but - # with bit-width >= the bit-width of either of the inputs. Then, - # representing the inputs as out_type for numpy simulation is safe. 
- out = self.npy_op( - lhs.astype(self.out_dtype.to_numpy_dt()), - rhs.astype(self.out_dtype.to_numpy_dt()) - ) + # Note: Always simulate in int64, numpy casting is weird.... + out = self.npy_op(lhs.astype(np.int64), rhs.astype(np.int64)) # Make sure the output has the right type, e.g. turn all booleans into - # integers if configured by the attribute + # integers (actually floats as the container type) # Note: This is relevant for logical ops, ==, <=, >=, etc. # Note: Somehow QONNX does not like boolean tensors - context[node.output[0]] = out.astype(self.out_dtype.to_numpy_dt()) + context[node.output[0]] = out.astype(np.float32) # Executes elementwise operation in C++ simulation def _execute_node_cppsim(self, context, graph): # noqa: graph unused @@ -399,19 +394,91 @@ def get_number_output_values(self): # Minimizes the width of the accumulator data type, 'accumulator width' here # due to convention, it is actually the output data type def minimize_accumulator_width(self, model: ModelWrapper): + # If any of the inputs is not an integer, the bit-width cannot be + # minimized + if not all([self.lhs_dtype.is_integer(), self.rhs_dtype.is_integer()]): + # Check the annotated tensor data type corresponds to the stored + # attribute + assert (model.get_tensor_datatype(self.onnx_node.output[0]) + == self.out_dtype), \ + f"Output type mismatch for {self.onnx_node.name}" + # Exit here, returning the not-minimized data type + return self.out_dtype + # Call the output type derivation specialized by the concrete operator + # implementation + out_dtype = self._derive_out_dtype(model) + # Set the new output data type as attribute + self.set_nodeattr("out_dtype", out_dtype.name) + # Annotate the output tensor with the new data type + model.set_tensor_datatype(self.onnx_node.output[0], out_dtype) + # Return the minimized output data type + # Note: Probably not required by MinimizeAccumulatorWidth transformation + return out_dtype + + # Derives the optimal width of the output data type + def _derive_out_dtype(self, model: ModelWrapper): # Depends on the actual operation performed and must be specialized by # the concrete implementations raise NotImplementedError( - f"minimize_accumulator_width of {self.__class__.__name__}" + f"_derive_out_dtype of {self.__class__.__name__}" f" is not implemented!" ) # Minimizes the width of the weight data type, 'weight' here due to # convention, it actually applies to any constant initializer input def minimize_weight_bit_width(self, model: ModelWrapper): - # TODO: Can actually be implemented in the base class. Without this - # might just be inefficient for now. 
- pass + # Check for an initializer providing the left hand side input + lhs = model.get_initializer(self.onnx_node.input[0]) + # If the left hand side input is provided as initializer, minimize the + # bits used for storing this + if lhs is not None: + # Remember the "style" of receiving the input for further code + # generation + self.set_nodeattr("lhs_style", "const") + # Minimum and maximum "weight" on the left hand side, determining + # the range of values which needs to be represented + _min = lhs.min() + _max = lhs.max() + # Determine whether signed or unsigned type is required for + # representing the weights and select the largest "signed magnitude" + _mag = _max if _min > 0 else \ + _min if (abs(_min) > _max) else (-_max - 1) + # Smallest data type large enough to represent this range of values + dtype = DataType.get_smallest_possible(_mag) + # Update the corresponding data type attribute of the node + self.set_nodeattr("lhs_dtype", dtype.name) + # Annotate the tensor with the new data type + model.set_tensor_datatype(self.onnx_node.input[0], dtype) + + # Check for an initializer providing the right hand side input + rhs = model.get_initializer(self.onnx_node.input[1]) + # If the right hand side input is provided as initializer, minimize the + # bits used for storing this + if rhs is not None: + # Remember the "style" of receiving the input for further code + # generation + self.set_nodeattr("rhs_style", "const") + # Minimum and maximum "weight" on the right hand side, determining + # the range of values which needs to be represented + _min = rhs.min() + _max = rhs.max() + assert _min != 0 + assert _max != 0 + # Determine whether signed or unsigned type is required for + # representing the weights and select the largest "signed magnitude" + _mag = _max if _min > 0 else \ + _min if (abs(_min) > _max) else (-_max - 1) + # Smallest data type large enough to represent this range of values + dtype = DataType.get_smallest_possible(_mag) + # Update the corresponding data type attribute of the node + self.set_nodeattr("rhs_dtype", dtype.name) + # Annotate the tensor with the new data type + model.set_tensor_datatype(self.onnx_node.input[1], dtype) + + # TODO: MVAU returns the data type here, which does not make sense for + # potentially two data types changing and apparently, the + # MinimizeWeightBitWidth transformations does not even use the returned + # value. 
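# Illustrative aside (not part of the patch): a worked example of the signed
# magnitude selection used above. For an initializer with values in [-3, 5],
# abs(_min) = 3 is not greater than _max = 5, so _mag = -_max - 1 = -6 and the
# smallest qonnx type covering the range is INT4 with range [-8, 7].
from qonnx.core.datatype import DataType

_min, _max = -3, 5
# Same selection expression as in minimize_weight_bit_width above
_mag = _max if _min > 0 else _min if (abs(_min) > _max) else (-_max - 1)
assert _mag == -6
assert DataType.get_smallest_possible(_mag).name == "INT4"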
# Derive a specialization to implement elementwise addition of two inputs @@ -421,6 +488,38 @@ class ElementwiseAdd(ElementwiseBinaryOperation): # hand side input _operation = "Add", np.add, "({0} + {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs and the larger of the + # two widths + lhs_width = self.lhs_dtype.bitwidth() + rhs_width = self.rhs_dtype.bitwidth() + max_width = max(lhs_width, rhs_width) + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # By default, the output is one bit more than the widest of the inputs + out_width = max_width + 1 + # If the addition is signed, the output might be wider depending on + # which of the inputs is signed + if signed: + # Find the wider and narrower of the two inputs by assuming left to + # right order first + wider, narrower = self.lhs_dtype, self.rhs_dtype + # Swap if the order is not correct + if narrower.bitwidth() > wider.bitwidth(): + wider, narrower = narrower, wider + # If and only if the wider is unsigned and the narrower is signed, + # add two bits to the output width + if not wider.signed() and narrower.signed(): + # Out has two bits more than the widest input + out_width = max_width + 2 + # The new output type is a signed integer of the calculated + # bit-width + return DataType[f"INT{out_width}"] + # By default, if both inputs are unsigned, the output is unsigned as + # well + return DataType[f"UINT{out_width}"] + # Derive a specialization to implement elementwise subtraction of two inputs @register_custom_op @@ -429,6 +528,34 @@ class ElementwiseSub(ElementwiseBinaryOperation): # right hand side input _operation = "Sub", np.subtract, "({0} - {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs and the larger of the + # two widths + lhs_width = self.lhs_dtype.bitwidth() + rhs_width = self.rhs_dtype.bitwidth() + max_width = max(lhs_width, rhs_width) + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # By default, the output is one bit more than the widest of the inputs + out_width = max_width + 1 + # If the operation is signed, the output might be wider depending on + # which of the inputs is signed + if signed: + # Find the wider and narrower of the two inputs by assuming left to + # right order first + wider, narrower = self.lhs_dtype, self.rhs_dtype + # Swap if the order is not correct + if narrower.bitwidth() > wider.bitwidth(): + wider, narrower = narrower, wider + # If and only if the wider is unsigned and the narrower is signed, + # add two bits to the output width + if not wider.signed() and narrower.signed(): + # Out has two bits more than the widest input + out_width = max_width + 2 + # For subtraction, the output data type is always signed + return DataType[f"INT{out_width}"] + # Derive a specialization to implement elementwise multiplication of two inputs @register_custom_op @@ -437,6 +564,19 @@ class ElementwiseMul(ElementwiseBinaryOperation): # right hand side input _operation = "Mul", np.multiply, "({0} * {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs + lhs_width = self.lhs_dtype.bitwidth() + rhs_width = 
self.rhs_dtype.bitwidth() + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # The width of the product is the sum of the widths of the operands. + out_width = lhs_width + rhs_width + # The product is treated as a signed type if either of the operands is + # of a signed type. + return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"] + # Derive a specialization to implement elementwise division of two inputs @register_custom_op @@ -446,6 +586,20 @@ class ElementwiseDiv(ElementwiseBinaryOperation): # right hand side input _operation = "Div", np.divide, "({0} / {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs + lhs_width = self.lhs_dtype.bitwidth() + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # The width of the quotient is the width of the dividend if the divisor + # is an unsigned type. Otherwise, it is the width of the dividend plus + # one. + out_width = lhs_width if not self.rhs_dtype.signed() else lhs_width + 1 + # The quotient is treated as a signed type if either of the operands is + # of a signed type. + return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"] + # TODO: ElementwiseMod - Requires extra attribute selecting the function @@ -457,6 +611,12 @@ class ElementwiseAnd(ElementwiseBinaryOperation): # right hand side input _operation = "And", np.logical_and, "({0} && {1})", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise logical or of two inputs @register_custom_op @@ -465,6 +625,12 @@ class ElementwiseOr(ElementwiseBinaryOperation): # right hand side input _operation = "Or", np.logical_or, "({0} || {1})", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise logical xor of two inputs @register_custom_op @@ -473,6 +639,12 @@ class ElementwiseXor(ElementwiseBinaryOperation): # right hand side input _operation = "Xor", np.logical_xor, "(bool({0}) != bool({1}))", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise equality of two inputs @register_custom_op @@ -481,6 +653,12 @@ class ElementwiseEqual(ElementwiseBinaryOperation): # right hand side input _operation = "Equal", np.equal, "({0} == {1})", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise less of two inputs @register_custom_op @@ -489,6 +667,12 @@ class ElementwiseLess(ElementwiseBinaryOperation): # right hand side input _operation = "Less", np.less, "({0} < {1})", None + # Derives the 
output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise less or equal of two inputs @register_custom_op @@ -497,6 +681,12 @@ class ElementwiseLessOrEqual(ElementwiseBinaryOperation): # side and right hand side input _operation = "LessOrEqual", np.less_equal, "({0} <= {1})", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise greater of two inputs @register_custom_op @@ -505,6 +695,12 @@ class ElementwiseGreater(ElementwiseBinaryOperation): # and right hand side input _operation = "Greater", np.greater, "({0} > {1})", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise greater or equal of two # inputs @@ -514,6 +710,12 @@ class ElementwiseGreaterOrEqual(ElementwiseBinaryOperation): # hand side and right hand side input _operation = "GreaterOrEqual", np.greater_equal, "({0} >= {1})", None + # Derives the output data type + def _derive_out_dtype(self, model: ModelWrapper): + # Treat the boolean output of a logical operation as unsigned integer of + # width 1, i.e., a single bit True/False + return DataType["BINARY"] + # Derive a specialization to implement elementwise bitwise and of two inputs @register_custom_op @@ -522,6 +724,20 @@ class ElementwiseBitwiseAnd(ElementwiseBinaryOperation): # right hand side input _operation = "BitwiseAnd", np.bitwise_and, "({0} & {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs # noqa: Duplicate + lhs_width = self.lhs_dtype.bitwidth() + rhs_width = self.rhs_dtype.bitwidth() + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # The bitwise logical operators all return a value with a width that is + # the maximum of the widths of the two operands. + out_width = max(lhs_width, rhs_width) + # The product is treated as a signed type if either of the operands is + # of a signed type. + return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"] + # Derive a specialization to implement elementwise bitwise or of two inputs @register_custom_op @@ -530,6 +746,20 @@ class ElementwiseBitwiseOr(ElementwiseBinaryOperation): # right hand side input _operation = "BitwiseOr", np.bitwise_or, "({0} | {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs # noqa: Duplicate + lhs_width = self.lhs_dtype.bitwidth() + rhs_width = self.rhs_dtype.bitwidth() + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # The bitwise logical operators all return a value with a width that is + # the maximum of the widths of the two operands. 
+ out_width = max(lhs_width, rhs_width) + # The product is treated as a signed type if either of the operands is + # of a signed type. + return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"] + # Derive a specialization to implement elementwise bitwise xor of two inputs @register_custom_op @@ -538,6 +768,19 @@ class ElementwiseBitwiseXor(ElementwiseBinaryOperation): # right hand side input _operation = "BitwiseXor", np.bitwise_xor, "({0} ^ {1})", None + # Derives the output data type according to UG1399 + def _derive_out_dtype(self, model: ModelWrapper): + # Get the width of the data types of the inputs # noqa: Duplicate + lhs_width = self.lhs_dtype.bitwidth() + rhs_width = self.rhs_dtype.bitwidth() + # Check whether the addition operation is a signed addition + signed = any([self.lhs_dtype.signed(), self.rhs_dtype.signed()]) + # The bitwise logical operators all return a value with a width that is + # the maximum of the widths of the two operands. + out_width = max(lhs_width, rhs_width) + # The product is treated as a signed type if either of the operands is + # of a signed type. + return DataType[f"INT{out_width}" if signed else f"UINT{out_width}"] # TODO: ElementwiseBitShift - Requires extra attribute selecting the direction diff --git a/tests/fpgadataflow/test_elementwise_binary.py b/tests/fpgadataflow/test_elementwise_binary.py index 1aa02bd090..7b3bf65e87 100644 --- a/tests/fpgadataflow/test_elementwise_binary.py +++ b/tests/fpgadataflow/test_elementwise_binary.py @@ -68,7 +68,16 @@ InferElementwiseBinaryOperation, InferThresholdingLayer, ) +# Synthesizes HLS code generated from an operator to IP block from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +# Bit-width optimization transformations +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) +# Transformations preparing the operators for C++ and RTL simulation from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -240,6 +249,11 @@ def test_elementwise_binary_operation_python( # Test running shape and data type inference on the model graph model = model.transform(InferDataTypes()) model = model.transform(InferShapes()) + + # Try to minimize the bit-widths of all data types involved + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # Set model execution mode to python simulation model = model.transform(SetExecMode("python")) model = model.transform(GiveUniqueNodeNames()) @@ -247,11 +261,9 @@ def test_elementwise_binary_operation_python( # Compute ground-truth output in software o_expected = numpy_reference( # Note: Need to make sure these have the right type for the Numpy API - # Note: Assume out_type to be always of the same kind as the inputs but - # with bit-width >= the bit-width of either of the inputs. Then, - # representing the inputs as out_type for numpy simulation is safe. 
- context["lhs"].astype(DataType[out_dtype].to_numpy_dt()), - context["rhs"].astype(DataType[out_dtype].to_numpy_dt()), + # Note: Assume all test cases fit into int64 without loss of precision + context["lhs"].astype(np.int64), + context["rhs"].astype(np.int64) ) # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["out"] @@ -314,6 +326,11 @@ def test_elementwise_binary_operation_cppsim( model = model.transform(InferShapes()) # Specializes all nodes to be implemented as HLS backend model = specialize_hls(model) + + # Try to minimize the bit-widths of all data types involved + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # Set model execution mode to C++ simulation model = model.transform(SetExecMode("cppsim")) # Generates the C++ source and compiles the C++ simulation @@ -324,11 +341,9 @@ def test_elementwise_binary_operation_cppsim( # Compute ground-truth output in software o_expected = numpy_reference( # Note: Need to make sure these have the right type for the Numpy API - # Note: Assume out_type to be always of the same kind as the inputs but - # with bit-width >= the bit-width of either of the inputs. Then, - # representing the inputs as out_type for numpy simulation is safe. - context["lhs"].astype(DataType[out_dtype].to_numpy_dt()), - context["rhs"].astype(DataType[out_dtype].to_numpy_dt()), + # Note: Assume all test cases fit into int64 without loss of precision + context["lhs"].astype(np.int64), + context["rhs"].astype(np.int64) ) # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["out"] @@ -391,6 +406,11 @@ def test_elementwise_binary_operation_rtlsim( model = model.transform(InferShapes()) # Specializes all nodes to be implemented as HLS backend model = specialize_hls(model) + + # Try to minimize the bit-widths of all data types involved + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # Set model execution mode to RTL simulation model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation @@ -402,11 +422,9 @@ def test_elementwise_binary_operation_rtlsim( # Compute ground-truth output in software o_expected = numpy_reference( # Note: Need to make sure these have the right type for the Numpy API - # Note: Assume out_type to be always of the same kind as the inputs but - # with bit-width >= the bit-width of either of the inputs. Then, - # representing the inputs as out_type for numpy simulation is safe. 
- context["lhs"].astype(DataType[out_dtype].to_numpy_dt()), - context["rhs"].astype(DataType[out_dtype].to_numpy_dt()), + # Note: Assume all test cases fit into int64 without loss of precision + context["lhs"].astype(np.int64), + context["rhs"].astype(np.int64) ) # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["out"] @@ -514,6 +532,10 @@ def forward(self, *xs): "Defaults": {"PE": [pe, ["ElementwiseAdd", "Thresholding"]]} })) + # Try to minimize the bit-widths of all data types involved + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # Prepare the execution context with dummy data from above and input # node names extracted from transformed modelo graph context = {} From 4769d8e23a32a823205e5a85d7f3128053acfff0 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 19 Apr 2024 21:21:05 +0200 Subject: [PATCH 17/26] [Elementwise] Add support for floating-point operations This depends on adding float support support to Slice in finn-hlslib. --- custom_hls/flatten.hpp | 21 ++ .../fpgadataflow/elementwise_binary.py | 9 +- tests/fpgadataflow/test_elementwise_binary.py | 227 ++++++++++++++++++ 3 files changed, 254 insertions(+), 3 deletions(-) diff --git a/custom_hls/flatten.hpp b/custom_hls/flatten.hpp index a5c95b69e7..ccb15d5af6 100644 --- a/custom_hls/flatten.hpp +++ b/custom_hls/flatten.hpp @@ -23,4 +23,25 @@ template return flat; } +// Flattens an array of N elements of float into a single bitvector +template + ap_uint flatten(const float buffer[N]) { +// Inline this small piece of bit merging logic +#pragma HLS INLINE + // Fill a flat word of N times the bit-width of the element type + ap_uint flat; + // Merge all N chunks of the tile into the flat bitvector + for(unsigned j = 0; j < N; ++j) { +// Do the merging of all chunks in parallel +#pragma HLS UNROLL + // Insert the chunk into the right place of the + // bitvector + flat((j + 1) * 32 - 1, j * 32) = + // Note: Reinterpret the float as a 32-bit unsigned bit-vector + *reinterpret_cast*>(&buffer[j]); + } + // Return the buffer flattened into a single bitvector + return flat; + } + #endif // FLATTEN_HPP diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index 309325905e..c9baf5a1dc 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -205,11 +205,14 @@ def _execute_node_python(self, context, graph): # noqa: graph unused # Get the inputs out of the execution context lhs = context[node.input[0]] rhs = context[node.input[1]] + # Note: Need to make sure these have the right type for the Numpy API + # Note: Always simulate integer inputs in int64, numpy casting is + # weird.... + lhs = lhs.astype(np.int64) if self.lhs_dtype.is_integer() else lhs + rhs = rhs.astype(np.int64) if self.rhs_dtype.is_integer() else rhs # Apply elementwise operation with broadcasting in numpy and insert # result into the execution context - # Note: Need to make sure these have the right type for the Numpy API - # Note: Always simulate in int64, numpy casting is weird.... - out = self.npy_op(lhs.astype(np.int64), rhs.astype(np.int64)) + out = self.npy_op(lhs, rhs) # Make sure the output has the right type, e.g. turn all booleans into # integers (actually floats as the container type) # Note: This is relevant for logical ops, ==, <=, >=, etc. 
diff --git a/tests/fpgadataflow/test_elementwise_binary.py b/tests/fpgadataflow/test_elementwise_binary.py index 7b3bf65e87..0222be62a4 100644 --- a/tests/fpgadataflow/test_elementwise_binary.py +++ b/tests/fpgadataflow/test_elementwise_binary.py @@ -141,6 +141,11 @@ def specialize_hls(model: ModelWrapper): # TODO: "ElementwisePow": np.power } +# Names of bitwise operations which somtimes require special treatment +BITWISE = [ + "ElementwiseBitwiseAnd", "ElementwiseBitwiseOr", "ElementwiseBitwiseXor" +] + # Creates a model executing a binary elementwise operation def mock_elementwise_binary_operation( @@ -272,6 +277,74 @@ def test_elementwise_binary_operation_python( assert np.all(o_produced == o_expected) +# Operator type to be tested +@pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup + # Test all Numpy references specified above, except for the bitwise + # operations, for which floating-point doe not make sense + *sorted((NUMPY_REFERENCES.keys() - BITWISE)), +]) +# Data type of the left-hand-side input elements +@pytest.mark.parametrize("lhs_dtype", ["FLOAT32"]) +# Data type of the right-hand-side input elements +@pytest.mark.parametrize("rhs_dtype", ["FLOAT32"]) +# Data type of the output elements +@pytest.mark.parametrize("out_dtype", ["FLOAT32"]) +# Shape of the left-hand-side input +@pytest.mark.parametrize("lhs_shape", [ + [3, 1, 7, 1], [1] +]) +# Shape of the right-hand-side input +@pytest.mark.parametrize("rhs_shape", [ + [3, 32, 1, 16], +]) +# Which inputs to set as initializers +@pytest.mark.parametrize("initializers", [ + [], ["lhs"], ["rhs"], ["lhs", "rhs"] +]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4]) +def test_elementwise_binary_operation_float_python( + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe, + initializers +): + # Make dummy model for testing + model = mock_elementwise_binary_operation( # noqa: Duplicate test setup + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe + ) + # Prepare the execution context + context = { + "lhs": gen_finn_dt_tensor(DataType[lhs_dtype], lhs_shape), + "rhs": gen_finn_dt_tensor(DataType[rhs_dtype], rhs_shape) + } + + # Turn selected inputs into initializers + for name in initializers: + model.set_initializer(name, context[name]) + + # Get the numpy reference implementation for this operation + numpy_reference = NUMPY_REFERENCES[op_type] + + # Test running shape and data type inference on the model graph + model = model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + + # Try to minimize the bit-widths of all data types involved + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + + # Set model execution mode to python simulation + model = model.transform(SetExecMode("python")) + model = model.transform(GiveUniqueNodeNames()) + + # Compute ground-truth output in software + o_expected = numpy_reference(context["lhs"], context["rhs"]) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare the expected to the produced for exact equality + assert np.all(o_produced == o_expected) + + # Operator type to be tested @pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup # Test all Numpy references specified above @@ -352,6 +425,82 @@ def test_elementwise_binary_operation_cppsim( assert np.all(o_produced == o_expected) +# Operator type to be tested +@pytest.mark.parametrize("op_type", [ # noqa: Duplicate 
test setup + # Test all Numpy references specified above, except for the bitwise + # operations, for which floating-point doe not make sense + *sorted((NUMPY_REFERENCES.keys() - BITWISE)), +]) +# Data type of the left-hand-side input elements +@pytest.mark.parametrize("lhs_dtype", ["FLOAT32"]) +# Data type of the right-hand-side input elements +@pytest.mark.parametrize("rhs_dtype", ["FLOAT32"]) +# Data type of the output elements +@pytest.mark.parametrize("out_dtype", ["FLOAT32"]) +# Shape of the left-hand-side input +@pytest.mark.parametrize("lhs_shape", [ + [3, 1, 7, 1], [1] +]) +# Shape of the right-hand-side input +@pytest.mark.parametrize("rhs_shape", [ + [3, 32, 1, 16], +]) +# Which inputs to set as initializers +@pytest.mark.parametrize("initializers", [ + [], ["lhs"], ["rhs"], ["lhs", "rhs"] +]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +def test_elementwise_binary_operation_float_cppsim( + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe, + initializers +): + # Make dummy model for testing + model = mock_elementwise_binary_operation( # noqa: Duplicate test setup + op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe + ) + # Prepare the execution context + context = { + "lhs": gen_finn_dt_tensor(DataType[lhs_dtype], lhs_shape), + "rhs": gen_finn_dt_tensor(DataType[rhs_dtype], rhs_shape) + } + + # Turn selected inputs into initializers + for name in initializers: + model.set_initializer(name, context[name]) + + # Get the numpy reference implementation for this operation + numpy_reference = NUMPY_REFERENCES[op_type] + + # Test running shape and data type inference on the model graph + model = model.transform(InferDataTypes()) + model = model.transform(InferShapes()) + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) + + # Try to minimize the bit-widths of all data types involved + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Compute ground-truth output in software + o_expected = numpy_reference(context["lhs"], context["rhs"]) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare the expected to the produced for exact equality + assert np.all(o_produced == o_expected) + + # Operator type to be tested @pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup # Test all Numpy references specified above @@ -433,6 +582,84 @@ def test_elementwise_binary_operation_rtlsim( assert np.all(o_produced == o_expected) +# TODO: No floating-point support in RTL simulation +# # Operator type to be tested +# @pytest.mark.parametrize("op_type", [ # noqa: Duplicate test setup +# # Test all Numpy references specified above, except for the bitwise +# # operations, for which floating-point doe not make sense +# *sorted((NUMPY_REFERENCES.keys() - BITWISE)), +# ]) +# # Data type of the left-hand-side input elements +# @pytest.mark.parametrize("lhs_dtype", ["FLOAT32"]) +# # Data type of the right-hand-side input elements +# 
@pytest.mark.parametrize("rhs_dtype", ["FLOAT32"]) +# # Data type of the output elements +# @pytest.mark.parametrize("out_dtype", ["FLOAT32"]) +# # Shape of the left-hand-side input +# @pytest.mark.parametrize("lhs_shape", [ +# [3, 1, 7, 1], [1] +# ]) +# # Shape of the right-hand-side input +# @pytest.mark.parametrize("rhs_shape", [ +# [3, 32, 1, 16], +# ]) +# # Which inputs to set as initializers +# @pytest.mark.parametrize("initializers", [ +# [], ["lhs"], ["rhs"], ["lhs", "rhs"] +# ]) +# # Number of elements to process in parallel +# @pytest.mark.parametrize("pe", [1, 2, 4]) +# # This is a slow running fpgadataflow type of test which requires vivado +# @pytest.mark.fpgadataflow +# @pytest.mark.slow +# def test_elementwise_binary_operation_float_rtlsim( +# op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe, +# initializers +# ): +# # Make dummy model for testing +# model = mock_elementwise_binary_operation( # noqa: Duplicate test setup +# op_type, lhs_dtype, rhs_dtype, out_dtype, lhs_shape, rhs_shape, pe +# ) +# # Prepare the execution context +# context = { +# "lhs": gen_finn_dt_tensor(DataType[lhs_dtype], lhs_shape), +# "rhs": gen_finn_dt_tensor(DataType[rhs_dtype], rhs_shape) +# } +# +# # Turn selected inputs into initializers +# for name in initializers: +# model.set_initializer(name, context[name]) +# +# # Get the numpy reference implementation for this operation +# numpy_reference = NUMPY_REFERENCES[op_type] +# +# # Test running shape and data type inference on the model graph +# model = model.transform(InferDataTypes()) +# model = model.transform(InferShapes()) +# # Specializes all nodes to be implemented as HLS backend +# model = specialize_hls(model) +# +# # Try to minimize the bit-widths of all data types involved +# model = model.transform(MinimizeWeightBitWidth()) +# model = model.transform(MinimizeAccumulatorWidth()) +# +# # Set model execution mode to RTL simulation +# model = model.transform(SetExecMode("rtlsim")) +# # Generates the C++ source and compiles the RTL simulation +# model = model.transform(GiveUniqueNodeNames()) +# model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) # noqa +# model = model.transform(HLSSynthIP()) +# model = model.transform(PrepareRTLSim()) +# +# # Compute ground-truth output in software +# o_expected = numpy_reference(context["lhs"], context["rhs"]) +# # Execute the onnx model to collect the result +# o_produced = execute_onnx(model, context)["out"] +# +# # Compare the expected to the produced for exact equality +# assert np.all(o_produced == o_expected) + + # Test-case setting up a complete dummy model containing various elementwise # binary operations in PyTorch, converting to QONNX and verifying in Python, C++ # and RTL simulation From 87fc002f0162534eef397187982b4bca89481a4f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 3 May 2024 18:57:41 +0200 Subject: [PATCH 18/26] [Elementwise] Implement get_exp_cycles for ElementwiseBinaryOperation --- src/finn/custom_op/fpgadataflow/elementwise_binary.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index c9baf5a1dc..f8218e38e2 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -234,7 +234,7 @@ def _execute_node_rtlsim(self, context, graph): # noqa: graph unused code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") # Get the inputs out of 
the execution context lhs = context[node.input[0]] # noqa: Duplicate code prepare simulation - rhs = context[node.input[1]] + rhs = context[node.input[1]] # noqa: Duplicate code prepare simulation # Validate the shape of the inputs assert list(lhs.shape) == self.get_normal_input_shape(ind=0), \ f"Input shape mismatch for {node.input[0]}" @@ -278,7 +278,7 @@ def _execute_node_rtlsim(self, context, graph): # noqa: graph unused ) # Setup PyVerilator simulation of the node - sim = self.get_rtlsim() + sim = self.get_rtlsim() # noqa: Duplicate code prepare simulation # Reset the RTL simulation super().reset_rtlsim(sim) super().toggle_clk(sim) @@ -483,6 +483,13 @@ def minimize_weight_bit_width(self, model: ModelWrapper): # MinimizeWeightBitWidth transformations does not even use the returned # value. + # Derives the expected cycles for the elementwise binary operation given the + # folding configuration + def get_exp_cycles(self): + # Number of iterations required to process the whole folded input stream + # Note: This is all but the PE (last, parallelized) dimension + return np.prod(self.get_folded_output_shape()[:-1]) + # Derive a specialization to implement elementwise addition of two inputs @register_custom_op From efb1cc94a77208996cdb9f770037bbf711356d39 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 3 May 2024 19:04:25 +0200 Subject: [PATCH 19/26] [Elementwise] Add support for ElementwiseBinaryOperation to SetFolding --- .../fpgadataflow/set_folding.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index eaee499e6a..0ae425975c 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -27,12 +27,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Inspect information on Python objects like modules +import inspect import numpy as np import warnings from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveUniqueNodeNames +# Import the elementwise binary operation module to extract names of all +# specializations (which require PE parallelism to be configured) +import finn.custom_op.fpgadataflow.hls.elementwise_binary_hls as elementwise_binary_hls from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.util.fpgadataflow import is_hls_node, is_rtl_node @@ -44,6 +49,15 @@ def divisors(num): yield x +# Find the op-type names for all HLS specializations of elementwise binary +# operations +ELEMENTWISE_BINARY_OPS = [ + op_type + for op_type, cls in inspect.getmembers(elementwise_binary_hls, inspect.isclass) + if issubclass(cls, elementwise_binary_hls.ElementwiseBinaryOperation_hls) +] + + class SetFolding(Transformation): """Attempt to set parallelism attributes in all nodes to meet a specific target expressed as cycles per frame target_cycles_per_frame. 
For each @@ -106,6 +120,7 @@ def apply(self, model): "GlobalAccPool_hls", "Thresholding_hls", "Thresholding_rtl", + *ELEMENTWISE_BINARY_OPS, ] # these ops use SIMD parallelism, up to a max value of NumChannels # ConvolutionInputGenerator* has a special case when depthwise=1 @@ -151,7 +166,16 @@ def apply(self, model): # increase PE until target met or reached max_pe self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in pe_ops: - max_pe = node_inst.get_nodeattr("NumChannels") + # Note: Keep original behavior for all custom-ops defining the + # NumChannels attribute as it is + try: + max_pe = node_inst.get_nodeattr("NumChannels") + # Note: Some of the recent additions do not define the + # NumChannels attribute + except AttributeError: + # We can extract the channels from the normal, i.e., not + # folded, shape of the input in these cases + max_pe = node_inst.get_normal_input_shape()[-1] self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type == "LabelSelect_hls": max_pe = node_inst.get_nodeattr("Labels") From f34dcfcde3bf93c34d6c14065636c6920dfac5ba Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 10 May 2024 17:21:30 +0200 Subject: [PATCH 20/26] [Elementwise] Remove FIFO depths attribute overloads --- src/finn/custom_op/fpgadataflow/elementwise_binary.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index f8218e38e2..740854ec72 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -87,11 +87,7 @@ def get_nodeattr_types(self): # Note: Override to support python mode "exec_mode": ( "s", False, "python", {"", "rtlsim", "cppsim", "python"} - ), - # Input and output FIFO depths for multi-I/O nodes - # Note: Need to override here as there multiple outputs - "inFIFODepths": ("ints", False, [2, 2]), - "outFIFODepths": ("ints", False, []), # Default will be override + ) }) # Return updated attribute dictionary return attrs From e361cb91b600cdc51c18ecd621bb7eb9a48fdea1 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 17 May 2024 12:22:29 +0200 Subject: [PATCH 21/26] [Elementwise] Add ARRAY_PARTITION and BIND_STORAGE directives --- .../fpgadataflow/elementwise_binary.py | 6 +- .../hls/elementwise_binary_hls.py | 73 ++++++++++++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index 740854ec72..ac96e649aa 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -87,7 +87,11 @@ def get_nodeattr_types(self): # Note: Override to support python mode "exec_mode": ( "s", False, "python", {"", "rtlsim", "cppsim", "python"} - ) + ), + # FPGA resource type for memories/internal buffers of the operator + "ram_style": ( + "s", False, "auto", {"auto", "block", "distributed", "ultra"} + ), }) # Return updated attribute dictionary return attrs diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py index 37a70fd120..cffb964baf 100644 --- a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py @@ -32,6 +32,12 @@ ElementwiseBinaryOperation ) +# Mapping of memory resource attributes to the corresponding 
C++ HLS +# pragma directives +RAM_STYLES = { + "auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", "ultra": "URAM" +} + # HLS Backend specialization of the binary elementwise operation operator class ElementwiseBinaryOperation_hls( # noqa: Class name does not follow @@ -111,6 +117,18 @@ def generate_params(self, model: ModelWrapper, path: str): lhs = model.get_initializer(self.onnx_node.input[0]) # Folded output shape for broadcasting/aligning the input shapes out_shape = self.get_folded_output_shape(ind=0) + # Type of memory to use for storing constant parameters + ram_style = RAM_STYLES[self.get_nodeattr("ram_style")] + + # Check whether there are already pragmas in the code generation + # dictionary + if "$PRAGMAS$" not in self.code_gen_dict: + # If not, insert an empty list to collect more pragmas + # Note: Do this here as it is easier to add the array partition and + # bind storage pragmas for generated parameter here, where the shape + # is computed. + self.code_gen_dict["$PRAGMAS$"] = [] + # If the left hand side input is provided as initializer, generate # initializer parameters code if lhs is not None: @@ -139,6 +157,18 @@ def generate_params(self, model: ModelWrapper, path: str): lhs_code = numpy_to_hls_code( lhs, self.lhs_dtype, "lhs", False, False ) + # Add pragma configuring the storage type to use for the parameter + # tensors: This is a constant parameter implemented as dual-port ROM + self.code_gen_dict["$PRAGMAS$"].append( + f"#pragma HLS BIND_STORAGE" + f" variable=lhs type=ROM_2P impl={ram_style}" + ) + # Add pragma to partition the parameter tensor along the last + # dimensions, i.e., the PE dimension for parallel access + self.code_gen_dict["$PRAGMAS$"].append( + f"#pragma HLS ARRAY_PARTITION" + f" variable=lhs complete dim={len(lhs_shape)}" + ) # Check for an initializer providing the right hand side input rhs = model.get_initializer(self.onnx_node.input[1]) @@ -170,6 +200,18 @@ def generate_params(self, model: ModelWrapper, path: str): rhs_code = numpy_to_hls_code( rhs, self.rhs_dtype, "rhs", False, False ) + # Add pragma configuring the storage type to use for the parameter + # tensors: This is a constant parameter implemented as dual-port ROM + self.code_gen_dict["$PRAGMAS$"].append( + f"#pragma HLS BIND_STORAGE" + f" variable=rhs type=ROM_2P impl={ram_style}" + ) + # Add pragma to partition the parameter tensor along the last + # dimensions, i.e., the PE dimension for parallel access + self.code_gen_dict["$PRAGMAS$"].append( + f"#pragma HLS ARRAY_PARTITION" + f" variable=rhs complete dim={len(rhs_shape)}" + ) # Open a file to store the thresholds parameters as C++ code with open(f"{code_gen_dir}/params.hpp", "w") as file: @@ -327,6 +369,11 @@ def make_index_string(shape): lhs_buffer_shape = "".join([f'[{size}]' for size in lhs_buffer_shape]) rhs_buffer_shape = "".join([f'[{size}]' for size in rhs_buffer_shape]) + # Number of dimensions of the (broadcast) output. All shapes will be + # aligned to this number of dimensions. 
+ # Note: +1 for the PE dimension + ndim = len(out_shape) + 1 + # For-Loop template for nested loops over arbitrary many levels def for_loop(level, size): return f"for(std::size_t i{level} = 0; i{level}<{size}; ++i{level})" @@ -362,21 +409,37 @@ def unpack_buffer(shape): # elements to be unpacked return "buffer(pe, 0)" + # Type of memory to use for storing constant parameters + ram_style = RAM_STYLES[self.get_nodeattr("ram_style")] + # Write the body of the top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ # @formatter:off Disable formatter for mixed Python and C++ # For streamed inputs, generate local buffer of non-broadcast size # but broadcasts dimensions un-squeezed to size 1. For constant # inputs, use the generated parameters of the same name. + # For streamed inputs, implement a simple dual-port RAM partitioned + # on the last, i.e., the PE, axis for parallel access. f""" LhsType lhs{lhs_buffer_shape}[{self.pe}]; + #pragma HLS ARRAY_PARTITION variable=lhs complete dim={ndim} + #pragma HLS BIND_STORAGE variable=lhs type=RAM_S2P impl={ram_style} """ if self.lhs_style == "input" else """""", f""" RhsType rhs{rhs_buffer_shape}[{self.pe}]; + #pragma HLS ARRAY_PARTITION variable=rhs complete dim={ndim} + #pragma HLS BIND_STORAGE variable=rhs type=RAM_S2P impl={ram_style} """ if self.rhs_style == "input" else """""", - # Buffer to hold the parallel output elements + # Buffer to hold the parallel output elements: Implement a simple + # dual-port RAM for the output buffer, partitioned on the last, + # i.e., the PE, axis for parallel access. + # Note: The PE output should be rather small, force this into + # distributed memory here. + # TODO: Maybe reconsider this later? f""" OutType out[{self.pe}]; + #pragma HLS ARRAY_PARTITION variable=out complete dim=1 + #pragma HLS BIND_STORAGE variable=out type=RAM_S2P impl=LUTRAM """, # Perfect loop nest over all folded output dimensions *[for_loop(dim, size) + " {" for dim, size in enumerate(out_shape)], @@ -487,9 +550,15 @@ def blackboxfunction(self): # Generates C++ pragmas to be inserted into the main function of the C++ # simulation and the ipgen-blackboxfunction as well def pragmas(self): + # Check whether there are already pragmas in the code generation + # dictionary + if "$PRAGMAS$" not in self.code_gen_dict: + # If not, insert an empty list to collect more pragmas + self.code_gen_dict["$PRAGMAS$"] = [] + # Add HLS interface directives specifying how to create RTL ports for # the top-level function arguments - self.code_gen_dict["$PRAGMAS$"] = [ + self.code_gen_dict["$PRAGMAS$"] += [ # Connect the output stream with an axi stream interface f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}", ] From 653673b4705c3b580925650d3c086fcf73641e6e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 8 Aug 2024 16:05:34 +0200 Subject: [PATCH 22/26] [Streamline] Prevent FactorOutMulSignMagnitude from handling join-nodes Join-node Mul operations have no intitializer (parameters) and thus there is nothing to factor out. 
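To make the skipped case concrete: a join-node Mul is one whose inputs are both
produced by upstream nodes, so neither input carries an initializer and the
old, unconditional access to the mul weight initializer would fail. A minimal
sketch of such a graph (the Relu producers and tensor names are purely
illustrative, not taken from the FINN test suite):

    from onnx import TensorProto, helper as oh
    from qonnx.core.modelwrapper import ModelWrapper

    a = oh.make_tensor_value_info("a", TensorProto.FLOAT, [1, 4])
    b = oh.make_tensor_value_info("b", TensorProto.FLOAT, [1, 4])
    y = oh.make_tensor_value_info("y", TensorProto.FLOAT, [1, 4])
    graph = oh.make_graph(
        nodes=[
            # Both Mul inputs are produced by other nodes: no initializers
            oh.make_node("Relu", ["a"], ["a_relu"]),
            oh.make_node("Relu", ["b"], ["b_relu"]),
            oh.make_node("Mul", ["a_relu", "b_relu"], ["y"]),
        ],
        name="join_mul", inputs=[a, b], outputs=[y],
    )
    model = ModelWrapper(oh.make_model(graph))
    mul = model.graph.node[2]
    # The Mul is a join node and has no weight initializer to factor out
    assert model.is_join_node(mul)
    assert model.get_initializer(mul.input[1]) is None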
--- src/finn/transformation/streamline/absorb.py | 22 ++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index fc09d2dedd..4c280d8f28 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -30,17 +30,19 @@ import qonnx.core.data_layout as DataLayout import warnings from onnx import helper as oh -# Protobuf onnx graph node type -from onnx import NodeProto # noqa +from qonnx.core.datatype import DataType + # QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import get_by_name +# Protobuf onnx graph node type +from onnx import NodeProto # noqa + class AbsorbSignBiasIntoMultiThreshold(Transformation): """Absorb scalar bias originating from signed int export back into @@ -109,14 +111,10 @@ def apply(self, model): def group_inputs_by_category(node: NodeProto, model: ModelWrapper): # noqa # First select all dynamic inputs, which are those without initializer # tensor - dynamics = [ - i for i in node.input if model.get_initializer(i) is None - ] + dynamics = [i for i in node.input if model.get_initializer(i) is None] # Select all input which are initializers, which, by exclusion, are all # those not among the dynamic inputs - initializers = [ - i for i in node.input if i not in dynamics - ] + initializers = [i for i in node.input if i not in dynamics] # Return lists of dynamic anc initializer inputs return dynamics, initializers @@ -137,9 +135,7 @@ def apply(self, model): # As Add is not a join node, there must be one initializer # and one dynamic input. We do not know their order, but # can group them accordingly to extract the tensor names - (start,), (add_weight, ) = group_inputs_by_category( - n, model - ) + (start,), (add_weight,) = group_inputs_by_category(n, model) threshold = consumer.input[1] A = model.get_initializer(add_weight) T = model.get_initializer(threshold) @@ -236,7 +232,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if n.op_type == "Mul": + if n.op_type == "Mul" and not model.is_join_node(n): mul_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) assert A is not None, "Initializer for mul weights is not set." From de9791105aea1b1bb77eb2ac641af417abe30dcb Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 8 Aug 2024 16:09:32 +0200 Subject: [PATCH 23/26] [Streamline] Delete initializer datatype annotation after MoveAddPastMul This is probably just a workaround and proper datatype inference should be implemented later. For now it seems more safe to implicitly treat the resulting parameter tensor as floating-point than assuming a wrong datatype. In most cases the resulting Add operation will later be absorbed and rounded into some thresholds anyway. 
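To illustrate why keeping the stale annotation would be unsafe (the concrete
numbers below are made up for illustration): MoveAddPastMul rewrites
(x + B) * A into x * A + B * A, so the new add parameter is BA = B * A, which
may no longer fit the integer datatype annotated on the original B.

    import numpy as np
    from qonnx.core.datatype import DataType

    B = np.asarray([100.0])  # add parameter, annotated as INT8 before the move
    A = np.asarray([3.0])    # mul scale, also INT8
    BA = B * A               # 300.0 exceeds the old INT8 annotation
    assert BA[0] > DataType["INT8"].max()

Clearing the annotation and re-running InferDataTypes avoids claiming a
too-narrow type for BA; in the worst case the tensor is simply treated as
floating-point, as stated above.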
--- src/finn/transformation/streamline/reorder.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 1d6001a381..4cfc4cfff7 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -91,6 +91,9 @@ def apply(self, model): graph.node.insert(node_ind + 1, new_add) # replace add value model.set_initializer(add_weight_name, BA) + # Delete the datatype annotation of the parameter tensor + # TODO: Maybe we should derive the new type properly... + model.set_tensor_datatype(add_weight_name, None) # Delete the shape annotation of the connecting tensors # to be re-done later. This prevents shapes from propagating # backwards. @@ -105,6 +108,9 @@ def apply(self, model): # Note: Running shape inference is necessary as shape # annotations have been deleted above model = model.transform(InferShapes()) + # Note. Running datatype inference is necessary as datatype + # annotations have been deleted above + model = model.transform(InferDataTypes()) return model, graph_modified From dd680787824c2d070f0dfe90e8a30d8f70a79832 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 28 Aug 2024 14:51:17 +0200 Subject: [PATCH 24/26] [Elementwise] Reintroduce FIFO depths attribute overloads --- src/finn/custom_op/fpgadataflow/elementwise_binary.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index ac96e649aa..ad204f416e 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -92,6 +92,10 @@ def get_nodeattr_types(self): "ram_style": ( "s", False, "auto", {"auto", "block", "distributed", "ultra"} ), + # Input and output FIFO depths for multi-I/O nodes + # Note: Need to override here as there might be two inputs + "inFIFODepths": ("ints", False, [2, 2]), + "outFIFODepths": ("ints", False, [2]), }) # Return updated attribute dictionary return attrs From 2501f58c541aa7e5dc523b21c87061773a5e7fc7 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 28 Jan 2025 14:03:50 +0100 Subject: [PATCH 25/26] [Thresholding] Remove second offset left in due to merge conflict --- src/finn/custom_op/fpgadataflow/thresholding.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index cda2412617..d386305c9c 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -257,9 +257,6 @@ def execute_node(self, context, graph): if act == DataType["BIPOLAR"]: # binary to bipolar y = 2 * y - 1 - else: - # signed offset - y += act.min() context[node.output[0]] = y.astype(np.float32) def calc_tmem(self): From 57625f6f799c6e10f45db22aba9cb2e07dd2a482 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 28 Jan 2025 14:52:29 +0100 Subject: [PATCH 26/26] [Deps] flatten is now part of finn-hlslib but we do not have ap_float --- custom_hls/flatten.hpp | 47 ------------------- fetch-repos.sh | 4 +- .../hls/elementwise_binary_hls.py | 2 +- 3 files changed, 3 insertions(+), 50 deletions(-) delete mode 100644 custom_hls/flatten.hpp diff --git a/custom_hls/flatten.hpp b/custom_hls/flatten.hpp deleted file mode 100644 index ccb15d5af6..0000000000 --- a/custom_hls/flatten.hpp +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef FLATTEN_HPP -#define 
FLATTEN_HPP - -// HLS arbitrary precision types -#include - -// Flattens an array of N elements of Type into a single bitvector -template - ap_uint flatten(const Type buffer[N]) { -// Inline this small piece of bit merging logic -#pragma HLS INLINE - // Fill a flat word of N times the bit-width of the element type - ap_uint flat; - // Merge all N chunks of the tile into the flat bitvector - for(unsigned j = 0; j < N; ++j) { -// Do the merging of all chunks in parallel -#pragma HLS UNROLL - // Insert the chunk into the right place of the - // bitvector - flat((j + 1) * Type::width - 1, j * Type::width) = buffer[j]; - } - // Return the buffer flattened into a single bitvector - return flat; - } - -// Flattens an array of N elements of float into a single bitvector -template - ap_uint flatten(const float buffer[N]) { -// Inline this small piece of bit merging logic -#pragma HLS INLINE - // Fill a flat word of N times the bit-width of the element type - ap_uint flat; - // Merge all N chunks of the tile into the flat bitvector - for(unsigned j = 0; j < N; ++j) { -// Do the merging of all chunks in parallel -#pragma HLS UNROLL - // Insert the chunk into the right place of the - // bitvector - flat((j + 1) * 32 - 1, j * 32) = - // Note: Reinterpret the float as a 32-bit unsigned bit-vector - *reinterpret_cast*>(&buffer[j]); - } - // Return the buffer flattened into a single bitvector - return flat; - } - -#endif // FLATTEN_HPP diff --git a/fetch-repos.sh b/fetch-repos.sh index a4fc124fa4..1036c6b07a 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" +HLSLIB_COMMIT="bfc44f96763ecf6aded6e9fa8e48a1a3bfd8551a" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" @@ -45,7 +45,7 @@ FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git" BREVITAS_URL="https://github.com/Xilinx/brevitas.git" PYVERILATOR_URL="https://github.com/maltanar/pyverilator.git" CNPY_URL="https://github.com/rogersce/cnpy.git" -HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git" +HLSLIB_URL="https://github.com/iksnagreb/finn-hlslib.git" OMX_URL="https://github.com/maltanar/oh-my-xilinx.git" AVNET_BDF_URL="https://github.com/Avnet/bdf.git" XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" diff --git a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py index cffb964baf..a8ad6dd99a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/elementwise_binary_hls.py @@ -486,7 +486,7 @@ def unpack_buffer(shape): """, # Write the PE group into the output stream f""" - out_{self.hls_sname()}.write(flatten<{self.pe}>(out)); + out_{self.hls_sname()}.write(flatten(out)); """, # Close all for-loop bodies of the generated nest *["}" for _ in enumerate(out_shape)]