From 312832ff89cf9c9a6b4d5e2a969ea8798b99e9c6 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 10:31:04 +0200 Subject: [PATCH 001/103] Initial commit --- hls4ml/backends/__init__.py | 4 + hls4ml/backends/vitis_accelerator/__init__.py | 0 .../vitis_accelerator/passes/__init__.py | 0 .../passes/fifo_depth_optimization.py | 69 ++++++++ .../vitis_accelerator/supported_boards.json | 42 +++++ .../vitis_accelerator_backend.py | 163 ++++++++++++++++++ .../vitis_accelerator_config.py | 162 +++++++++++++++++ 7 files changed, 440 insertions(+) create mode 100644 hls4ml/backends/vitis_accelerator/__init__.py create mode 100644 hls4ml/backends/vitis_accelerator/passes/__init__.py create mode 100644 hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py create mode 100644 hls4ml/backends/vitis_accelerator/supported_boards.json create mode 100644 hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py create mode 100644 hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 6396d7815f..91a9272e74 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -2,14 +2,18 @@ from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend + from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip +from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip +from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig # noqa: F401 
register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) register_backend('Vitis', VitisBackend) +register_backend('VitisAccelerator', VitisAcceleratorBackend) register_backend('Quartus', QuartusBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/vitis_accelerator/__init__.py b/hls4ml/backends/vitis_accelerator/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/vitis_accelerator/passes/__init__.py b/hls4ml/backends/vitis_accelerator/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..e983ca49fb --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py @@ -0,0 +1,69 @@ +# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( +# generate_max_depth_file, +# get_vcd_data, +# populate_values, +# set_big_fifos, +# set_fifo_depth, +# ) +# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): +# def __init__(self): +# self.values = [] + +# def transform(self, model): +# # use `large_fifo_depth = 0` to keep the default fifo depth +# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) + +# # check axi-stream or io-stream, if not one the 2 exit +# if not ( +# model.config.get_config_value('IOType') == 'io_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' +# ): +# raise Exception( +# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' +# 'or `axi_stream` or `axi_master` in 
`AcceleratorConfig` interface field' +# ) + +# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be +# # profiled + +# if profiling_fifo_depth: +# set_big_fifos(model.output_vars, profiling_fifo_depth) + +# data = get_vcd_data(model) + +# for i in range(1, len(data['children'][0]['children'][0]['children'])): +# # wrapper fifos +# populate_values( +# self.values, +# data['children'][0]['children'][0]['children'][i]['name'], +# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], +# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], +# ) + +# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) +# for i in range(n_elem): +# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] +# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] +# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] +# populate_values(self.values, name, data_p, depth) + +# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] + +# generate_max_depth_file(model, maxs) + +# set_fifo_depth(model, maxs) + +# inp = model.get_input_variables()[0] +# out = model.get_output_variables()[0] +# for x in maxs: +# if 'in_local' in x['name']: +# inp.pragma = (inp.pragma[0], x['max'] + 1) +# elif 'out_local' in x['name']: +# out.pragma = (out.pragma[0], x['max'] + 1) + +# print('[hls4ml] - FIFO optimization completed') +# return False diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json new file mode 100644 index 0000000000..1279ec22d0 --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -0,0 +1,42 @@ +{ + "pynq-z2": { + "part": "xc7z020clg400-1", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, + 
"python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, + "zcu102": { + "part": "xczu9eg-ffvb1156-2-e", + "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, + "alveo-u50": { + "part": "xcu50-fsvh2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u250": { + "part": "xcu250-figd2104-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u200": { + "part": "xcu200-fsgd2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u280": { + "part": "xcu280-fsvh2892-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + } +} diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py new file mode 100644 index 0000000000..ccd9521269 --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -0,0 +1,163 @@ +import os + +from hls4ml.backends import VivadoBackend +from hls4ml.model.flow import register_flow +from hls4ml.report import parse_vivado_report + + +class VivadoAcceleratorBackend(VivadoBackend): + def __init__(self): + super(VivadoBackend, self).__init__(name='VivadoAccelerator') + self._register_layer_attributes() + self._register_flows() + + def build( + self, + model, + reset=False, + csim=True, + synth=True, + cosim=False, + validation=False, + 
export=False, + vsynth=False, + fifo_opt=False, + bitfile=False, + ): + # run the VivadoBackend build + super().build( + model, + reset=reset, + csim=csim, + synth=synth, + cosim=cosim, + validation=validation, + export=export, + vsynth=vsynth, + fifo_opt=fifo_opt, + ) + # Get Config to view Board and Platform + from hls4ml.backends import VivadoAcceleratorConfig + + vivado_accelerator_config = VivadoAcceleratorConfig( + model.config, model.get_input_variables(), model.get_output_variables() + ) + # now make a bitfile + if bitfile: + if vivado_accelerator_config.get_board().startswith('alveo'): + self.make_xclbin(model, vivado_accelerator_config.get_platform()) + else: + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + try: + os.system('vivado -mode batch -source design.tcl') + except Exception: + print("Something went wrong, check the Vivado logs") + os.chdir(curr_dir) + + return parse_vivado_report(model.config.get_output_dir()) + + def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): + """Create the xclbin for the given model and target platform. + + Args: + model (ModelGraph): Compiled and build model. + platform (str, optional): Development/Deployment target platform, must be installed first. + The host machine only requires the deployment target platform. Refer to the Getting Started section of + the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. 
+ """ + curr_dir = os.getcwd() + abs_path_dir = os.path.abspath(model.config.get_output_dir()) + os.chdir(abs_path_dir) + os.makedirs('xo_files', exist_ok=True) + try: + os.system('vivado -mode batch -source design.tcl') + except Exception: + print("Something went wrong, check the Vivado logs") + project_name = model.config.get_project_name() + ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' + os.makedirs('xclbin_files', exist_ok=True) + os.chdir(abs_path_dir + '/xclbin_files') + # TODO Add other platforms + vitis_cmd = ( + "v++ -t hw --platform " + + platform + + " --link ../xo_files/" + + project_name + + "_kernel.xo -o'" + + project_name + + "_kernel.xclbin' --user_ip_repo_paths " + + ip_repo_path + ) + try: + os.system(vitis_cmd) + except Exception: + print("Something went wrong, check the Vitis/Vivado logs") + os.chdir(curr_dir) + + def create_initial_config( + self, + board='pynq-z2', + part=None, + clock_period=5, + clock_uncertainty='12.5%', + io_type='io_parallel', + interface='axi_stream', + driver='python', + input_type='float', + output_type='float', + platform='xilinx_u250_xdma_201830_2', + ): + ''' + Create initial accelerator config with default parameters + + Args: + board: one of the keys defined in supported_boards.json + clock_period: clock period passed to hls project + io_type: io_parallel or io_stream + interface: `axi_stream`: generate hardware designs and drivers which exploit axi stream channels. + `axi_master`: generate hardware designs and drivers which exploit axi master channels. + `axi_lite` : generate hardware designs and drivers which exploit axi lite channels. (Don't use it + to exchange large amount of data) + driver: `python`: generates the python driver to use the accelerator in the PYNQ stack. + `c`: generates the c driver to use the accelerator bare-metal. + input_type: the wrapper input precision. Can be `float` or an `ap_type`. 
Note: VivadoAcceleratorBackend + will round the number of bits used to the next power-of-2 value. + output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note: + VivadoAcceleratorBackend will round the number of bits used to the next power-of-2 value. + platform: development target platform + + Returns: + populated config + ''' + board = board if board is not None else 'pynq-z2' + config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type) + config['AcceleratorConfig'] = {} + config['AcceleratorConfig']['Board'] = board + config['AcceleratorConfig']['Interface'] = interface # axi_stream, axi_master, axi_lite + config['AcceleratorConfig']['Driver'] = driver + config['AcceleratorConfig']['Precision'] = {} + config['AcceleratorConfig']['Precision']['Input'] = {} + config['AcceleratorConfig']['Precision']['Output'] = {} + config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed + config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or ap_fixed + if board.startswith('alveo'): + config['AcceleratorConfig']['Platform'] = platform + + return config + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def _register_flows(self): + vivado_ip = 'vivado:ip' + writer_passes = ['make_stamp', 'vivadoaccelerator:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) + self._default_flow = vivado_ip + + fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes + + register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py new file mode 100644 index 0000000000..7bd931ede3 --- /dev/null +++ 
b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py @@ -0,0 +1,162 @@ +import json +import os + +import numpy as np + +from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType + + +class VivadoAcceleratorConfig: + def __init__(self, config, model_inputs, model_outputs): + self.config = config.config + self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2') + self.supported_boards = json.load(open(os.path.dirname(__file__) + '/supported_boards.json')) + if self.board in self.supported_boards.keys(): + board_info = self.supported_boards[self.board] + self.part = board_info['part'] + else: + raise Exception('The board does not appear in supported_boards.json file') + + if self.config.get('Part') is not None: + if self.config.get('Part') != self.part: + print( + 'WARNING: You set a Part that does not correspond to the Board you specified. The correct ' + 'Part is now set.' + ) + self.config['Part'] = self.part + accel_config = self.config.get('AcceleratorConfig', None) + if accel_config is not None: + prec = accel_config.get('Precision') + if prec is None: + raise Exception('Precision must be provided in the AcceleratorConfig') + else: + if prec.get('Input') is None or prec.get('Output') is None: + raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision') + else: + accel_config = { + 'Precision': {'Input': 'float', 'Output': 'float'}, + 'Driver': 'python', + 'Interface': 'axi_stream', + } + config.config['AcceleratorConfig'] = accel_config + + self.interface = self.config['AcceleratorConfig'].get('Interface', 'axi_stream') # axi_stream, axi_master, axi_lite + self.driver = self.config['AcceleratorConfig'].get('Driver', 'python') # python or c + self.input_type = self.config['AcceleratorConfig']['Precision'].get( + 'Input', 'float' + ) # float, double or ap_fixed + self.output_type = self.config['AcceleratorConfig']['Precision'].get( + 'Output', 'float' + ) # float, double or 
ap_fixed + self.platform = self.config['AcceleratorConfig'].get( + 'Platform', 'xilinx_u250_xdma_201830_2' + ) # Get platform folder name + + assert ( + len(model_inputs) == 1 + ), "Only models with one input tensor are currently supported by VivadoAcceleratorBackend" + assert ( + len(model_outputs) == 1 + ), "Only models with one output tensor are currently supported by VivadoAcceleratorBackend" + self.inp = model_inputs[0] + self.out = model_outputs[0] + inp_axi_t = self.input_type + out_axi_t = self.output_type + + if inp_axi_t not in ['float', 'double']: + self.input_type = self._next_factor8_type(config.backend.convert_precision_string(inp_axi_t)) + if out_axi_t not in ['float', 'double']: + self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t)) + + if self.input_type == 'float': + self.input_bitwidth = 32 + elif self.input_type == 'double': + self.input_bitwidth = 64 + else: + self.input_bitwidth = config.backend.convert_precision_string(inp_axi_t).width + + if out_axi_t == 'float': + self.output_bitwidth = 32 + elif out_axi_t == 'double': + self.output_bitwidth = 64 + else: + self.output_bitwidth = config.backend.convert_precision_string(out_axi_t).width + + def _next_factor8_type(self, p): + '''Return a new type with the width rounded to the next factor of 8 up to p's width + Args: + p : IntegerPrecisionType or FixedPrecisionType + Returns: + An IntegerPrecisionType or FixedPrecisionType with the width rounder up to the next factor of 8 + of p's width. Other parameters (fractional bits, extra modes) stay the same. 
+ ''' + W = p.width + newW = int(np.ceil(W / 8) * 8) + if isinstance(p, FixedPrecisionType): + return FixedPrecisionType(newW, p.integer, p.signed, p.rounding_mode, p.saturation_mode, p.saturation_bits) + elif isinstance(p, IntegerPrecisionType): + return IntegerPrecisionType(newW, p.signed) + + def get_io_bitwidth(self): + return self.input_bitwidth, self.output_bitwidth + + def get_corrected_types(self): + return self.input_type, self.output_type, self.inp, self.out + + def get_interface(self): + return self.interface + + def get_board_info(self, board=None): + if board is None: + board = self.board + if board in self.supported_boards.keys(): + return self.supported_boards[board] + else: + raise Exception('The board is still not supported') + + def get_part(self): + return self.part + + def get_driver(self): + return self.driver + + def get_board(self): + return self.board + + def get_platform(self): + return self.platform + + def get_clock_period(self): + return self.clock_period + + def get_driver_path(self): + if self.board.startswith('alveo'): + return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() + else: + return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + + def get_driver_file(self): + driver_ext = '.py' if self.driver == 'python' else '.h' + return self.interface + '_driver' + driver_ext + + def get_krnl_rtl_src_dir(self): + return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src' + + def get_input_type(self): + return self.input_type + + def get_output_type(self): + return self.output_type + + def get_tcl_file_path(self): + board_info = self.get_board_info(self.board) + tcl_scripts = board_info.get('tcl_scripts', None) + if tcl_scripts is None: + raise Exception('No tcl scripts definition available for the board in supported_board.json') + tcl_script = tcl_scripts.get(self.interface, None) + if tcl_script is None: + raise 
Exception('No tcl script definition available for the desired interface in supported_board.json') + if self.board.startswith('alveo'): + return '../templates/vivado_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script + else: + return '../templates/vivado_accelerator/' + self.board + '/tcl_scripts/' + tcl_script From d2b5a15bcbd37c04529989bf434c19f031fcc03b Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 11:13:21 +0200 Subject: [PATCH 002/103] Set change the backend --- .../passes/fifo_depth_optimization.py | 69 ----------- .../vitis_accelerator/supported_boards.json | 34 ------ .../vitis_accelerator_backend.py | 110 +++++++++--------- .../vitis_accelerator_config.py | 2 +- 4 files changed, 56 insertions(+), 159 deletions(-) delete mode 100644 hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py deleted file mode 100644 index e983ca49fb..0000000000 --- a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py +++ /dev/null @@ -1,69 +0,0 @@ -# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( -# generate_max_depth_file, -# get_vcd_data, -# populate_values, -# set_big_fifos, -# set_fifo_depth, -# ) -# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass - - -# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): -# def __init__(self): -# self.values = [] - -# def transform(self, model): -# # use `large_fifo_depth = 0` to keep the default fifo depth -# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) - -# # check axi-stream or io-stream, if not one the 2 exit -# if not ( -# model.config.get_config_value('IOType') == 'io_stream' -# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' -# or 
model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' -# ): -# raise Exception( -# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' -# 'or `axi_stream` or `axi_master` in `AcceleratorConfig` interface field' -# ) - -# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be -# # profiled - -# if profiling_fifo_depth: -# set_big_fifos(model.output_vars, profiling_fifo_depth) - -# data = get_vcd_data(model) - -# for i in range(1, len(data['children'][0]['children'][0]['children'])): -# # wrapper fifos -# populate_values( -# self.values, -# data['children'][0]['children'][0]['children'][i]['name'], -# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], -# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], -# ) - -# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) -# for i in range(n_elem): -# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] -# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] -# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] -# populate_values(self.values, name, data_p, depth) - -# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] - -# generate_max_depth_file(model, maxs) - -# set_fifo_depth(model, maxs) - -# inp = model.get_input_variables()[0] -# out = model.get_output_variables()[0] -# for x in maxs: -# if 'in_local' in x['name']: -# inp.pragma = (inp.pragma[0], x['max'] + 1) -# elif 'out_local' in x['name']: -# out.pragma = (out.pragma[0], x['max'] + 1) - -# print('[hls4ml] - FIFO optimization completed') -# return False diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json index 1279ec22d0..5f44560ccd 100644 --- 
a/hls4ml/backends/vitis_accelerator/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -4,39 +4,5 @@ "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} - }, - "zcu102": { - "part": "xczu9eg-ffvb1156-2-e", - "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "c_drivers": {} - }, - "alveo-u50": { - "part": "xcu50-fsvh2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u250": { - "part": "xcu250-figd2104-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u200": { - "part": "xcu200-fsgd2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u280": { - "part": "xcu280-fsvh2892-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} } } diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py index ccd9521269..4c54e05328 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -1,13 +1,13 @@ import os -from hls4ml.backends import VivadoBackend +from hls4ml.backends import VitisBackend, VivadoBackend from hls4ml.model.flow import register_flow from hls4ml.report import 
parse_vivado_report -class VivadoAcceleratorBackend(VivadoBackend): +class VitisAcceleratorBackend(VitisBackend): def __init__(self): - super(VivadoBackend, self).__init__(name='VivadoAccelerator') + super(VivadoBackend, self).__init__(name='VitisAccelerator') self._register_layer_attributes() self._register_flows() @@ -24,7 +24,7 @@ def build( fifo_opt=False, bitfile=False, ): - # run the VivadoBackend build + # run the VitisBackend build super().build( model, reset=reset, @@ -37,63 +37,63 @@ def build( fifo_opt=fifo_opt, ) # Get Config to view Board and Platform - from hls4ml.backends import VivadoAcceleratorConfig + from hls4ml.backends import VitisAcceleratorConfig - vivado_accelerator_config = VivadoAcceleratorConfig( + vitis_accelerator_config = VitisAcceleratorConfig( model.config, model.get_input_variables(), model.get_output_variables() ) # now make a bitfile if bitfile: - if vivado_accelerator_config.get_board().startswith('alveo'): - self.make_xclbin(model, vivado_accelerator_config.get_platform()) - else: - curr_dir = os.getcwd() - os.chdir(model.config.get_output_dir()) - try: - os.system('vivado -mode batch -source design.tcl') - except Exception: - print("Something went wrong, check the Vivado logs") - os.chdir(curr_dir) + # if vitis_accelerator_config.get_board().startswith('alveo'): + # self.make_xclbin(model, vitis_accelerator_config.get_platform()) + # else: + curr_dir = os.getcwd() + os.chdir(model.config.get_output_dir()) + try: + os.system('vivado -mode batch -source design.tcl') # check if this is accepted as a command + except Exception: + print("Something went wrong, check the Vivado logs") + os.chdir(curr_dir) return parse_vivado_report(model.config.get_output_dir()) - def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): - """Create the xclbin for the given model and target platform. - - Args: - model (ModelGraph): Compiled and build model. 
- platform (str, optional): Development/Deployment target platform, must be installed first. - The host machine only requires the deployment target platform. Refer to the Getting Started section of - the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. - """ - curr_dir = os.getcwd() - abs_path_dir = os.path.abspath(model.config.get_output_dir()) - os.chdir(abs_path_dir) - os.makedirs('xo_files', exist_ok=True) - try: - os.system('vivado -mode batch -source design.tcl') - except Exception: - print("Something went wrong, check the Vivado logs") - project_name = model.config.get_project_name() - ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' - os.makedirs('xclbin_files', exist_ok=True) - os.chdir(abs_path_dir + '/xclbin_files') - # TODO Add other platforms - vitis_cmd = ( - "v++ -t hw --platform " - + platform - + " --link ../xo_files/" - + project_name - + "_kernel.xo -o'" - + project_name - + "_kernel.xclbin' --user_ip_repo_paths " - + ip_repo_path - ) - try: - os.system(vitis_cmd) - except Exception: - print("Something went wrong, check the Vitis/Vivado logs") - os.chdir(curr_dir) + # def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): + # """Create the xclbin for the given model and target platform. + + # Args: + # model (ModelGraph): Compiled and build model. + # platform (str, optional): Development/Deployment target platform, must be installed first. + # The host machine only requires the deployment target platform. Refer to the Getting Started section of + # the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. 
+ # """ + # curr_dir = os.getcwd() + # abs_path_dir = os.path.abspath(model.config.get_output_dir()) + # os.chdir(abs_path_dir) + # os.makedirs('xo_files', exist_ok=True) + # try: + # os.system('vivado -mode batch -source design.tcl') + # except Exception: + # print("Something went wrong, check the Vivado logs") + # project_name = model.config.get_project_name() + # ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' + # os.makedirs('xclbin_files', exist_ok=True) + # os.chdir(abs_path_dir + '/xclbin_files') + # # TODO Add other platforms + # vitis_cmd = ( + # "v++ -t hw --platform " + # + platform + # + " --link ../xo_files/" + # + project_name + # + "_kernel.xo -o'" + # + project_name + # + "_kernel.xclbin' --user_ip_repo_paths " + # + ip_repo_path + # ) + # try: + # os.system(vitis_cmd) + # except Exception: + # print("Something went wrong, check the Vitis/Vivado logs") + # os.chdir(curr_dir) def create_initial_config( self, @@ -141,8 +141,8 @@ def create_initial_config( config['AcceleratorConfig']['Precision']['Output'] = {} config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or ap_fixed - if board.startswith('alveo'): - config['AcceleratorConfig']['Platform'] = platform + # if board.startswith('alveo'): + # config['AcceleratorConfig']['Platform'] = platform return config diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py index 7bd931ede3..70429ef0d7 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py @@ -6,7 +6,7 @@ from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType -class VivadoAcceleratorConfig: +class VitisAcceleratorConfig: def __init__(self, config, model_inputs, model_outputs): self.config = config.config 
self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2') From 02659dd11e0f63c5d0b2d2c57ce7c371aebd99d8 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 11:19:18 +0200 Subject: [PATCH 003/103] Change the accelerator config script --- .../vitis_accelerator/vitis_accelerator_config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py index 70429ef0d7..b0bf4e894b 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py @@ -54,10 +54,10 @@ def __init__(self, config, model_inputs, model_outputs): assert ( len(model_inputs) == 1 - ), "Only models with one input tensor are currently supported by VivadoAcceleratorBackend" + ), "Only models with one input tensor are currently supported by VitisAcceleratorBackend" assert ( len(model_outputs) == 1 - ), "Only models with one output tensor are currently supported by VivadoAcceleratorBackend" + ), "Only models with one output tensor are currently supported by VitisAcceleratorBackend" self.inp = model_inputs[0] self.out = model_outputs[0] inp_axi_t = self.input_type @@ -131,16 +131,16 @@ def get_clock_period(self): def get_driver_path(self): if self.board.startswith('alveo'): - return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() else: - return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() def get_driver_file(self): driver_ext = '.py' if self.driver == 'python' else '.h' return self.interface + '_driver' + driver_ext def 
get_krnl_rtl_src_dir(self): - return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src' + return '../templates/vitis_accelerator/' + 'alveo/' + '/krnl_rtl_src' def get_input_type(self): return self.input_type @@ -157,6 +157,6 @@ def get_tcl_file_path(self): if tcl_script is None: raise Exception('No tcl script definition available for the desired interface in supported_board.json') if self.board.startswith('alveo'): - return '../templates/vivado_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script else: - return '../templates/vivado_accelerator/' + self.board + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator/' + self.board + '/tcl_scripts/' + tcl_script From 56296b6734462efc0cf0c4421abe875e4016cb52 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 14:22:53 +0200 Subject: [PATCH 004/103] Set the vitis accelerator template --- .../templates/vitis_accelerator/build_lib.sh | 17 +++++ .../vitis_accelerator/myproject_axi.cpp | 14 ++++ .../vitis_accelerator/myproject_axi.h | 10 +++ .../python_drivers/axi_stream_driver.py | 75 +++++++++++++++++++ .../pynq-z2/tcl_scripts/axi_lite_design.tcl | 26 +++++++ .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 59 +++++++++++++++ .../python_drivers/axi_stream_driver.py | 75 +++++++++++++++++++ .../zcu102/tcl_scripts/axi_stream_design.tcl | 58 ++++++++++++++ 8 files changed, 334 insertions(+) create mode 100644 hls4ml/templates/vitis_accelerator/build_lib.sh create mode 100644 hls4ml/templates/vitis_accelerator/myproject_axi.cpp create mode 100644 hls4ml/templates/vitis_accelerator/myproject_axi.h create mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py create mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl create mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl create mode 100644 
hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py create mode 100644 hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl diff --git a/hls4ml/templates/vitis_accelerator/build_lib.sh b/hls4ml/templates/vitis_accelerator/build_lib.sh new file mode 100644 index 0000000000..69a2bace57 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/build_lib.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +INCFLAGS="-Ifirmware/ap_types/" +PROJECT=myproject +LIB_STAMP=mystamp + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp new file mode 100644 index 0000000000..05797f1f7b --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp @@ -0,0 +1,14 @@ +// hls-fpga-machine-learning insert include + +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]) { + + // hls-fpga-machine-learning insert interface + + // hls-fpga-machine-learning insert local vars + + // hls-fpga-machine-learning insert enqueue + + // hls-fpga-machine-learning insert call + + // hls-fpga-machine-learning insert dequeue +} diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.h b/hls4ml/templates/vitis_accelerator/myproject_axi.h new file mode 100644 index 0000000000..a60dab39c4 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.h @@ -0,0 +1,10 @@ +#ifndef MYPROJECT_AXI_H_ +#define MYPROJECT_AXI_H_ + +#include +// 
hls-fpga-machine-learning insert include + +// hls-fpga-machine-learning insert definitions + +void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]); +#endif diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..1aac79f2d3 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from datetime import datetime + +import numpy as np +from pynq import Overlay, allocate + + +class NeuralNetworkOverlay(Overlay): + def __init__( + self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None + ): + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = timeb - timea + dts = dt.seconds + dt.microseconds * 10**-6 + rate = N / dts + print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). 
+ In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. + """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl new file mode 100644 index 0000000000..4d23da26cc --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl @@ -0,0 +1,26 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths 
${project_name}_prj [current_project] +update_ip_catalog + +# Create Block Designer design +create_bd_design "design_1" +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS] + +make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..aa06e8a6d2 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,59 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] 
+update_ip_catalog + +create_bd_design "design_1" + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] + +startgroup +set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup + +set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip 
{/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup + +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] + +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] + +make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..1aac79f2d3 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from datetime import datetime + +import numpy as np +from pynq import Overlay, allocate + + +class NeuralNetworkOverlay(Overlay): + def __init__( + self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None + ): + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = 
allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = timeb - timea + dts = dt.seconds + dt.microseconds * 10**-6 + rate = N / dts + print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. 
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..5d886c6f25 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,58 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list 
CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup 
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]
+
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk]
+group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0]
+
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages

From 7dd01737d79856ce790d78e6ab50597a4ac74131 Mon Sep 17 00:00:00 2001
From: steltze
Date: Fri, 19 Apr 2024 14:41:21 +0200
Subject: [PATCH 005/103] Set vitis accelerator writer

---
 hls4ml/writer/__init__.py                 |   4 +-
 hls4ml/writer/vitis_accelerator_writer.py | 431 ++++++++++++++++++++++
 2 files changed, 434 insertions(+), 1 deletion(-)
 create mode 100644 hls4ml/writer/vitis_accelerator_writer.py

diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py
index f4eed945a1..b97ce99884 100644
--- a/hls4ml/writer/__init__.py
+++ b/hls4ml/writer/__init__.py
@@ -1,6 +1,7 @@
 from hls4ml.writer.quartus_writer import QuartusWriter
 from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter
-from hls4ml.writer.vitis_writer import VitisWriter
+from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter
+from hls4ml.writer.vitis_writer import VitisWriter
 from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
 from hls4ml.writer.vivado_writer import VivadoWriter
 from
hls4ml.writer.writers import Writer, get_writer, register_writer # noqa: F401 @@ -8,5 +9,6 @@ register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) +register_writer('VitisAccelerator', VitisAcceleratorWriter) register_writer('Quartus', QuartusWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py new file mode 100644 index 0000000000..fed95905e2 --- /dev/null +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -0,0 +1,431 @@ +import os +from distutils.dir_util import copy_tree +from shutil import copyfile + +# from hls4ml.writer.vivado_writer import VivadoWriter +from hls4ml.writer.vitis_writer import VitisWriter + + +class VitisAcceleratorWriter(VitisWriter): + def __init__(self): + super().__init__() + self.vitis_accelerator_config = None + + def write_axi_wrapper(self, model): + '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces + Args: + model : The ModelGraph to write the wrapper for + ''' + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() + indent = ' ' + + ####################### + # myproject_axi.h + ####################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}.h"\n' + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert definitions' in line: + 
newline = '' + newline += f'static const unsigned N_IN = {inp.size()};\n' + newline += f'static const unsigned N_OUT = {out.size()};\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += f'typedef {inp_axi_t} T_in;\n' + newline += f'typedef {out_axi_t} T_out;\n' + newline += ( + 'typedef struct in_struct {\n' + + indent + + 'T_in data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'in_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + + indent + + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'in_struct(double data) {this->data = data; this->last = 0;}\n' + + '} input_axi_t;\n' + ) + newline += ( + 'typedef struct out_struct {\n' + + indent + + 'T_out data;\n' + + indent + + 'ap_uint<1> last;\n' + + indent + + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + + indent + + 'out_struct(){this->data = 0; this->last = 0;};\n' + + indent + + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + + indent + + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + + indent + + 'operator float() const {return this->data;}\n' + + indent + + 'operator double() const {return this->data;}\n' + + indent + + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + + indent + + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + + '} output_axi_t;\n' + ) + else: + newline += f'typedef {inp_axi_t} input_axi_t;\n' + newline += f'typedef 
{out_axi_t} output_axi_t;\n' + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + ####################### + # myproject_axi.cpp + ####################### + + f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') + + io_type = model.config.get_config_value("IOType") + + for line in f.readlines(): + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}_axi.h"\n' + elif '// hls-fpga-machine-learning insert local vars' in line: + newline = '' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += indent + 'bool is_last = false;\n' + if io_type == 'io_parallel': + newline += indent + inp.type.name + ' in_local[N_IN];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' + elif io_type == 'io_stream': + newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' + newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' + newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif '// hls-fpga-machine-learning insert call' in line: + newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' + elif '// hls-fpga-machine-learning insert interface' in line: + if self.vitis_accelerator_config.get_interface() == 'axi_lite': + newline = '' + newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' + newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' + newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' + elif 
self.vitis_accelerator_config.get_interface() == 'axi_master': + newline = '' + newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline = '' + newline += indent + '#pragma HLS INTERFACE axis port=in\n' + newline += indent + '#pragma HLS INTERFACE axis port=out\n' + newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' + if model.config.get_config_value("IOType") == 'io_stream': + newline += indent + '#pragma HLS DATAFLOW\n' + elif '// hls-fpga-machine-learning insert enqueue' in line: + io_type = model.config.get_config_value("IOType") + if io_type == 'io_parallel': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' + newline += indent + indent + 'is_last |= (in[i].last == 1)? 
true: false;\n' + else: + newline += indent + indent + '#pragma HLS UNROLL\n' + newline += indent + indent + 'in_local[i] = in[i]; // Read input with cast\n' + newline += indent + '}\n' + elif io_type == 'io_stream': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + '{input_t} ctype;\n' + newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' + ) + newline += ( + indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + ) + else: + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + ) + newline += indent + indent + '}}\n' + newline += indent + indent + 'in_local.write(ctype);\n' + newline += indent + '}}\n' + newline = newline.format(input_t=inp.type.name) + elif '// hls-fpga-machine-learning insert dequeue' in line: + io_type = model.config.get_config_value("IOType") + if io_type == 'io_parallel': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + 'out[i].data = out_local[i]; // Write output with cast\n' + newline += indent + indent + 'out[i].last = (is_last && (i == N_OUT - 1))? 
true : false;\n' + else: + newline += indent + indent + '#pragma HLS UNROLL\n' + newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' + newline += indent + '}\n' + elif io_type == 'io_stream': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + '{result_t} ctype = out_local.read();\n' + newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + newline += ( + indent + + indent + + indent + + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' + ) + newline += ( + indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + ) + else: + newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' + newline += indent + indent + '}}\n' + newline += indent + '}}\n' + newline = newline.format(result_t=out.type.name) + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def modify_build_script(self, model): + ''' + Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function + ''' + filedir = os.path.dirname(os.path.abspath(__file__)) + oldfile = f'{model.config.get_output_dir()}/build_prj.tcl' + newfile = f'{model.config.get_output_dir()}/build_prj_axi.tcl' + f = open(oldfile) + fout = open(newfile, 'w') + + for line in f.readlines(): + if 'set_top' in line: + newline = line[:-1] + '_axi\n' # remove the newline from the line end and append _axi for the new top + newline += f'add_files firmware/{model.config.get_project_name()}_axi.cpp -cflags "-std=c++0x"\n' + elif f'{model.config.get_project_name()}_cosim' in line: + newline = line.replace( + 
f'{model.config.get_project_name()}_cosim', + f'{model.config.get_project_name()}_axi_cosim', + ) + elif '${project_name}.tcl' in line: + newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl') + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + ################### + # build_lib.sh + ################### + + f = open(os.path.join(filedir, '../templates/vitis_accelerator/build_lib.sh')) + fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') + + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + fout.write(line) + f.close() + fout.close() + + def write_wrapper_test(self, model): + ################### + # write myproject_test_wrapper.cpp + ################### + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' + + f = open(oldfile) + fout = open(newfile, 'w') + + inp = model.get_input_variables()[0] + out = model.get_output_variables()[0] + + for line in f.readlines(): + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') + elif inp.definition_cpp() in line: + newline = line.replace( + inp.definition_cpp(), 'input_axi_t inputs[N_IN]' + ) # TODO instead of replacing strings, how about we use proper variables and their definition? 
+ elif out.definition_cpp() in line: + newline = line.replace(out.definition_cpp(), 'output_axi_t outputs[N_OUT]') + elif 'unsigned short' in line: + newline = '' + elif f'{model.config.get_project_name()}(' in line: + indent_amount = line.split(model.config.get_project_name())[0] + newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' + elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + newline = ( + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') + ) + elif out.size_cpp() in line or out.name in line or out.type.name in line: + newline = ( + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'output_axi_t') + ) + else: + newline = line + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if 'nnet::fill_zero' in line: + indent = line.split('n')[0] + newline = indent + 'inputs[N_IN-1].last = 1;\n' + if 'copy_data' in line: + newline = newline.replace('copy_data', 'copy_data_axi') + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + ################### + # write myproject_bridge_wrapper.cpp + ################### + oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp' + newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge_wrapper.cpp' + + f = open(oldfile) + fout = open(newfile, 'w') + + inp = model.get_input_variables()[0] + out = model.get_output_variables()[0] + + for line in f.readlines(): + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') + elif inp.definition_cpp(name_suffix='_ap') in line: + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'input_axi_t {inp.name}_ap[N_IN]') + elif out.definition_cpp(name_suffix='_ap') in line: + newline = 
line.replace(out.definition_cpp(name_suffix='_ap'), f'output_axi_t {out.name}_ap[N_OUT]') + elif f'{model.config.get_project_name()}(' in line: + indent_amount = line.split(model.config.get_project_name())[0] + newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( + model.config.get_project_name(), inp.name, out.name + ) + elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'input_axi_t') + elif out.size_cpp() in line or out.name in line or out.type.name in line: + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'output_axi_t') + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + def write_board_script(self, model): + ''' + Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAccelerator + ''' + filedir = os.path.dirname(os.path.abspath(__file__)) + copyfile( + os.path.join(filedir, self.vitis_accelerator_config.get_tcl_file_path()), + f'{model.config.get_output_dir()}/design.tcl', + ) + # Generic alveo board + if self.vitis_accelerator_config.get_board().startswith('alveo'): + src_dir = os.path.join(filedir, self.vitis_accelerator_config.get_krnl_rtl_src_dir()) + dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' + copy_tree(src_dir, dst_dir) + + ################### + # project.tcl + ################### + f = open(f'{model.config.get_output_dir()}/project.tcl', 'w') + f.write('variable project_name\n') + f.write(f'set project_name "{model.config.get_project_name()}"\n') + f.write('variable backend\n') + f.write('set backend "vitisaccelerator"\n') + f.write('variable part\n') + f.write(f'set part "{self.vitis_accelerator_config.get_part()}"\n') + f.write('variable clock_period\n') + f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty 
{}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) + f.write('variable version\n') + f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) + if self.vitis_accelerator_config.get_interface() == 'axi_stream': + in_bit, out_bit = self.vitis_accelerator_config.get_io_bitwidth() + f.write(f'set bit_width_hls_output {in_bit}\n') + f.write(f'set bit_width_hls_input {out_bit}\n') + f.close() + + def write_driver(self, model): + filedir = os.path.dirname(os.path.abspath(__file__)) + copyfile( + os.path.join(filedir, self.vitis_accelerator_config.get_driver_path()), + ('{}/' + self.vitis_accelerator_config.get_driver_file()).format(model.config.get_output_dir()), + ) + + def write_new_tar(self, model): + os.remove(model.config.get_output_dir() + '.tar.gz') + super().write_tar(model) + + def write_hls(self, model): + """ + Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAccelerator/AXI interface + """ + # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package + from hls4ml.backends import VitisAcceleratorConfig + + self.vitis_accelerator_config = VitisAcceleratorConfig( + model.config, model.get_input_variables(), model.get_output_variables() + ) + super().write_hls(model) + self.write_board_script(model) + self.write_driver(model) + self.write_wrapper_test(model) + self.write_axi_wrapper(model) + self.modify_build_script(model) + self.write_new_tar(model) From 6f181b8f2d20ec941fca65f373cf5658991880a6 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 14:42:02 +0200 Subject: [PATCH 006/103] Fix writes init --- hls4ml/writer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index b97ce99884..759a7115b1 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,6 +1,6 @@ from hls4ml.writer.quartus_writer import QuartusWriter 
from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter -from hls4ml.writer.vitis_writer import VitisWrite +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter From bd2e52e4951c8b9eda866518153254131701dfe1 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 10 May 2024 10:33:27 +0200 Subject: [PATCH 007/103] Include separable convolution resource implementation --- .../vitis_accelerator_backend.py | 2 +- .../vivado/nnet_utils/nnet_sepconv_stream.h | 84 +++++++++++++++++-- hls4ml/writer/vitis_accelerator_writer.py | 4 +- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py index 4c54e05328..85a6d02f2c 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -34,7 +34,7 @@ def build( validation=validation, export=export, vsynth=vsynth, - fifo_opt=fifo_opt, + # fifo_opt=fifo_opt, ) # Get Config to view Board and Platform from hls4ml.backends import VitisAcceleratorConfig diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 9c16de1908..93532292d6 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -4,11 +4,77 @@ #include "hls_stream.h" #include "nnet_common.h" #include "nnet_conv_stream.h" +#include namespace nnet { template -void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], +void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename 
CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; + const int nout = CONFIG_T::n_chan; + + const int rufactor = MIN(CONFIG_T::reuse_factor, nin); + // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); + // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); + const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); + // const int multscale = multiplier_limit; + + // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor + // std::cout << sizeof(CONFIG_T::n_chan) << std::endl; + +InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + +int out_index = 0; + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + // int w_index = ir; + // int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + out_index = ((in_index % CONFIG_T::n_chan)); + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); + + + in_index+=rufactor; + + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < nout; ires++) { + // #pragma HLS 
UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { #pragma HLS INLINE @@ -78,9 +144,9 @@ void depthwise_mult_buffer(hls::stream data_window[ #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product(data, res, weights, biases); + depthwise_product_latency(data, res, weights, biases); } else { - assert("Resource strategy for DepthwiseConv2D is not supported." && false); + depthwise_product_resource(data, res, weights, biases); } CastLoop: @@ -202,10 +268,11 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream(kernel_data, res_out, + depthwise_product_latency(kernel_data, res_out, weights, biases); } else { - assert("Resource strategy for DepthwiseConv1D is not supported." && false); + depthwise_product_resource(kernel_data, res_out, + weights, biases); } // Pack output @@ -267,10 +334,11 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, // Dense multiply #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product(kernel_data, res_out, + depthwise_product_latency(kernel_data, res_out, weights, biases); } else { - assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); + depthwise_product_resource(kernel_data, res_out, + weights, biases); } // Pack output @@ -303,4 +371,4 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, } } // namespace nnet -#endif +#endif \ No newline at end of file diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index fed95905e2..c29f917882 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -163,7 +163,7 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'is_last |= (in[i].last == 1)? true: false;\n' else: newline += indent + indent + '#pragma HLS UNROLL\n' - newline += indent + indent + 'in_local[i] = in[i]; // Read input with cast\n' + newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': newline = '' @@ -188,7 +188,7 @@ def write_axi_wrapper(self, model): indent + indent + indent - + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j]);\n' + + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' ) newline += indent + indent + '}}\n' newline += indent + indent + 'in_local.write(ctype);\n' From b79524062ea0ad776fda50f31a7e5698f016c585 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 29 May 2024 10:45:42 +0200 Subject: [PATCH 008/103] Separate depthwise resource strategy to 3 cases --- .../vivado/nnet_utils/nnet_sepconv_stream.h | 156 +++++++++++++++++- 1 file changed, 151 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 93532292d6..8d8ff9712e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -9,7 +9,7 @@ namespace nnet { template -void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T 
res[CONFIG_T::n_chan], +void depthwise_product_resource_rf_leq_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { @@ -34,7 +34,7 @@ void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_ typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // std::cout << sizeof(CONFIG_T::n_chan) << std::endl; + std::cout << "LEQ IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { @@ -42,6 +42,72 @@ void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_ acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; } +int out_index = 0; + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + out_index = in_index % CONFIG_T::n_chan; + // int w_index = ir; + // int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); + + in_index+=rufactor; + + out_index+=rufactor; + out_index -= ((out_index) >= CONFIG_T::n_chan)*CONFIG_T::n_chan; + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < nout; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + + +template +void depthwise_product_resource_rf_gt_nchan_rem0(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; + const int nout = CONFIG_T::n_chan; + + const int rufactor = MIN(CONFIG_T::reuse_factor, nin); + // const int multfactor = MIN(nin, 
CONFIG_T::reuse_factor); + // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); + const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); + // const int multscale = multiplier_limit; + + // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor + std::cout << "REM0 IMPLE" << std::endl; + +InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } int out_index = 0; @@ -56,23 +122,87 @@ int out_index = 0; MultLoop: for (int im = 0; im < block_factor; im++) { #pragma HLS UNROLL - out_index = ((in_index % CONFIG_T::n_chan)); + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); + in_index+=rufactor; + } + out_index++; + out_index -= ((out_index) == CONFIG_T::n_chan)*CONFIG_T::n_chan; + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < nout; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void depthwise_product_resource_rf_gt_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; + const int 
nout = CONFIG_T::n_chan; + + const int rufactor = MIN(CONFIG_T::reuse_factor, nin); + // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); + // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); + const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); + // const int multscale = multiplier_limit; + + // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor + + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor + std::cout << "GT IMPLE" << std::endl; + +InitAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +int out_index = 0; + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int in_index = ir; + // int w_index = ir; + // int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + out_index = in_index % CONFIG_T::n_chan; + acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); in_index+=rufactor; - } } // Cast to "res_t" type Result: for (int ires = 0; ires < nout; ires++) { - // #pragma HLS UNROLL + #pragma HLS UNROLL res[ires] = cast(acc[ires]); } } + template void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t 
weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], @@ -124,6 +254,22 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c } } +template +void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor < CONFIG_T::n_chan) { + depthwise_product_resource_rf_leq_nchan(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_chan == 0) { + depthwise_product_resource_rf_gt_nchan_rem0(data, res, weights, biases); + } else { + depthwise_product_resource_rf_gt_nchan(data, res, weights, biases); + } +} + template void depthwise_mult_buffer(hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T &res_pack, hls::stream &res_stream, unsigned &outputs_ready, From eeb04d4a9a842914181d5461b45297fa45a447a8 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 29 May 2024 11:47:50 +0200 Subject: [PATCH 009/103] Complete vitis accelerator wrapper for io_stream case --- .../vitis_accelerator/myproject_axi.cpp | 2 +- .../vitis_accelerator/myproject_axi.h | 2 +- hls4ml/writer/vitis_accelerator_writer.py | 161 ++++++++++-------- 3 files changed, 94 insertions(+), 71 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp index 05797f1f7b..01238643ed 100644 --- a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.cpp @@ -1,6 +1,6 @@ // hls-fpga-machine-learning insert include -void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]) { +void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out) { // hls-fpga-machine-learning insert interface diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.h 
b/hls4ml/templates/vitis_accelerator/myproject_axi.h index a60dab39c4..d49f98ba14 100644 --- a/hls4ml/templates/vitis_accelerator/myproject_axi.h +++ b/hls4ml/templates/vitis_accelerator/myproject_axi.h @@ -6,5 +6,5 @@ // hls-fpga-machine-learning insert definitions -void myproject_axi(input_axi_t in[N_IN], output_axi_t out[N_OUT]); +void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out); #endif diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index c29f917882..a2270fb610 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -32,6 +32,7 @@ def write_axi_wrapper(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif '// hls-fpga-machine-learning insert include' in line: newline = f'#include "{model.config.get_project_name()}.h"\n' + newline = '#include "ap_axi_sdata.h' elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '// hls-fpga-machine-learning insert definitions' in line: @@ -39,57 +40,58 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': - newline += f'typedef {inp_axi_t} T_in;\n' - newline += f'typedef {out_axi_t} T_out;\n' - newline += ( - 'typedef struct in_struct {\n' - + indent - + 'T_in data;\n' - + indent - + 'ap_uint<1> last;\n' - + indent - + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - + indent - + 'in_struct(){this->data = 0; this->last = 0;};\n' - + indent - + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' - + indent - + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' - + indent - + 'operator float() const {return this->data;}\n' - + 
indent - + 'operator double() const {return this->data;}\n' - + indent - + 'in_struct(float data) {this->data = data; this->last = 0;}\n' - + indent - + 'in_struct(double data) {this->data = data; this->last = 0;}\n' - + '} input_axi_t;\n' - ) - newline += ( - 'typedef struct out_struct {\n' - + indent - + 'T_out data;\n' - + indent - + 'ap_uint<1> last;\n' - + indent - + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - + indent - + 'out_struct(){this->data = 0; this->last = 0;};\n' - + indent - + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' - + indent - + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' - + indent - + 'operator float() const {return this->data;}\n' - + indent - + 'operator double() const {return this->data;}\n' - + indent - + 'out_struct(float data) {this->data = data; this->last = 0;}\n' - + indent - + 'out_struct(double data) {this->data = data; this->last = 0;}\n' - + '} output_axi_t;\n' - ) - else: + newline += f'typedef hls::axis<{inp_axi_t}, 0, 0, 0> my_pkt;;\n' + # newline += f'typedef {inp_axi_t} T_in;\n' + # newline += f'typedef {out_axi_t} T_out;\n' + # newline += ( + # 'typedef struct in_struct {\n' + # + indent + # + 'T_in data;\n' + # + indent + # + 'ap_uint<1> last;\n' + # + indent + # + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + # + indent + # + 'in_struct(){this->data = 0; this->last = 0;};\n' + # + indent + # + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' + # + indent + # + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' + # + indent + # + 'operator float() const {return this->data;}\n' + # + indent + # + 'operator double() const {return this->data;}\n' + # + indent + # + 'in_struct(float data) {this->data = data; this->last = 0;}\n' + # + indent + # + 
'in_struct(double data) {this->data = data; this->last = 0;}\n' + # + '} input_axi_t;\n' + # ) + # newline += ( + # 'typedef struct out_struct {\n' + # + indent + # + 'T_out data;\n' + # + indent + # + 'ap_uint<1> last;\n' + # + indent + # + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' + # + indent + # + 'out_struct(){this->data = 0; this->last = 0;};\n' + # + indent + # + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' + # + indent + # + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' + # + indent + # + 'operator float() const {return this->data;}\n' + # + indent + # + 'operator double() const {return this->data;}\n' + # + indent + # + 'out_struct(float data) {this->data = data; this->last = 0;}\n' + # + indent + # + 'out_struct(double data) {this->data = data; this->last = 0;}\n' + # + '} output_axi_t;\n' + # ) + else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' else: @@ -114,9 +116,9 @@ def write_axi_wrapper(self, model): newline = f'#include "{model.config.get_project_name()}_axi.h"\n' elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': - newline += indent + 'bool is_last = false;\n' - if io_type == 'io_parallel': + # if self.vitis_accelerator_config.get_interface() == 'axi_stream': + # newline += indent + 'bool is_last = false;\n' + if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' newline += indent + out.type.name + ' out_local[N_OUT];\n' elif io_type == 'io_stream': @@ -131,12 +133,12 @@ def write_axi_wrapper(self, model): elif '// hls-fpga-machine-learning insert call' in line: newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' elif '// hls-fpga-machine-learning 
insert interface' in line: - if self.vitis_accelerator_config.get_interface() == 'axi_lite': + if self.vitis_accelerator_config.get_interface() == 'axi_lite': # TODO: handle axi_lite newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' - elif self.vitis_accelerator_config.get_interface() == 'axi_master': + elif self.vitis_accelerator_config.get_interface() == 'axi_master': # TODO: handle axi_master newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( @@ -154,7 +156,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS DATAFLOW\n' elif '// hls-fpga-machine-learning insert enqueue' in line: io_type = model.config.get_config_value("IOType") - if io_type == 'io_parallel': + if io_type == 'io_parallel': # TODO: handle io_parallel newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': @@ -166,24 +168,37 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': + newline = '' + newline += indent + 'my_pkt tmp_a;\n' + + newline = '' + newline += indent + 'my_pkt tmp_b;\n' + newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' - # newline += indent + indent + '#pragma HLS PIPELINE\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed newline += indent + indent + '{input_t} ctype;\n' - newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + # newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + # newline += indent + indent + 'pragma HLS aggregate 
variable=ctype compact=auto' # TODO: check if needed newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' - # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed if self.vitis_accelerator_config.get_interface() == 'axi_stream': newline += ( indent + indent + indent - + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n' + + 'in.read(tmp_a);\n' ) newline += ( - indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + indent + + indent + + indent + + 'ctype[j] = tmp_a.data;\n' ) - else: + # newline += ( + # indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? true : false;\n' + # ) + else: # TODO: handle this case newline += ( indent + indent @@ -196,7 +211,7 @@ def write_axi_wrapper(self, model): newline = newline.format(input_t=inp.type.name) elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") - if io_type == 'io_parallel': + if io_type == 'io_parallel': # TODO: handle this case newline = '' newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': @@ -215,14 +230,22 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': + # newline += ( + # indent + # + indent + # + indent + # + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' + # ) newline += ( - indent - + indent - + indent - + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? 
true : false;\n' + indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' ) + + newline += ( + indent + indent + indent + 'if(tmp_a.last == 1) {tmp_b.last = (((i+1)*(j+1))==N_OUT);}\n' + ) + newline += ( - indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j], last);\n' + indent + indent + indent + 'out.write(tmp_b);\n' ) else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' From 7e47c859fd0ccf3030f99f84267df76cc2b9f343 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 31 May 2024 16:45:28 +0200 Subject: [PATCH 010/103] Fix call to wrong backend writer --- .../backends/vitis_accelerator/vitis_accelerator_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py index 85a6d02f2c..2e3de9a1cd 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py @@ -154,10 +154,10 @@ def get_writer_flow(self): def _register_flows(self): vivado_ip = 'vivado:ip' - writer_passes = ['make_stamp', 'vivadoaccelerator:write_hls'] + writer_passes = ['make_stamp', 'vitisaccelerator:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) self._default_flow = vivado_ip - fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes + # fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes - register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) + # register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) From 5a2a38fe56f425c22156659da7bd1508f5263a5c Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 31 May 2024 17:55:15 +0200 Subject: [PATCH 011/103] Fix vitis 
accelerator writer --- hls4ml/writer/vitis_accelerator_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index a2270fb610..382ff658ad 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -241,7 +241,7 @@ def write_axi_wrapper(self, model): ) newline += ( - indent + indent + indent + 'if(tmp_a.last == 1) {tmp_b.last = (((i+1)*(j+1))==N_OUT);}\n' + indent + indent + indent + 'if(tmp_a.last == 1) {{tmp_b.last = (((i+1)*(j+1))==N_OUT);}}\n' ) newline += ( From 99f9429f8aef1b47d73897a48cef4a2b688fb0d6 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 11 Jun 2024 11:16:02 +0200 Subject: [PATCH 012/103] Fix include in axi wrapper header file writer --- .../templates/vivado/ap_types/ap_axi_sdata.h | 441 ++++++++++++++++++ .../vivado/nnet_utils/nnet_sepconv_stream.h | 6 +- hls4ml/writer/vitis_accelerator_writer.py | 4 +- 3 files changed, 445 insertions(+), 6 deletions(-) create mode 100755 hls4ml/templates/vivado/ap_types/ap_axi_sdata.h diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h new file mode 100755 index 0000000000..e01c8a8cd1 --- /dev/null +++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h @@ -0,0 +1,441 @@ +// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689 +/***************************************************************************** + * + * Author: Xilinx, Inc. + * + * This text contains proprietary, confidential information of + * Xilinx, Inc. , is distributed by under license from Xilinx, + * Inc., and may be used, copied and/or disclosed only pursuant to + * the terms of a valid license agreement with Xilinx, Inc. + * + * XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS" + * AS A COURTESY TO YOU, SOLELY FOR USE IN DEVELOPING PROGRAMS AND + * SOLUTIONS FOR XILINX DEVICES. 
BY PROVIDING THIS DESIGN, CODE, + * OR INFORMATION AS ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE, + * APPLICATION OR STANDARD, XILINX IS MAKING NO REPRESENTATION + * THAT THIS IMPLEMENTATION IS FREE FROM ANY CLAIMS OF INFRINGEMENT, + * AND YOU ARE RESPONSIBLE FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE + * FOR YOUR IMPLEMENTATION. XILINX EXPRESSLY DISCLAIMS ANY + * WARRANTY WHATSOEVER WITH RESPECT TO THE ADEQUACY OF THE + * IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OR + * REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE FROM CLAIMS OF + * INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. + * + * Xilinx products are not intended for use in life support appliances, + * devices, or systems. Use in such applications is expressly prohibited. + * +#- (c) Copyright 2011-2022 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. 
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. 
+#- ************************************************************************ + + * + *****************************************************************************/ + +/* + * This file contains the definition of the data types for AXI streaming. + * ap_axi_s is a signed interpretation of the AXI stream + * ap_axi_u is an unsigned interpretation of the AXI stream + */ + +#ifndef __AP__AXI_SDATA__ +#define __AP__AXI_SDATA__ + +#include +#include "ap_int.h" +//#include "ap_fixed.h" +template +struct ap_fixed; +template +struct ap_ufixed; + +namespace hls { + +template constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT; + +template constexpr std::size_t bitwidth> = W; +template constexpr std::size_t bitwidth> = W; +template +constexpr std::size_t bitwidth> = _AP_W; +template +constexpr std::size_t bitwidth> = _AP_W; + +template +constexpr std::size_t bytewidth = (bitwidth + CHAR_BIT - 1) / CHAR_BIT; + +template struct axis { + static constexpr std::size_t NewWUser = (WUser == 0) ? 1 : WUser; + static constexpr std::size_t NewWId = (WId == 0) ? 1 : WId; + static constexpr std::size_t NewWDest = (WDest == 0) ? 1 : WDest; + T data; + ap_uint> keep; + ap_uint> strb; + ap_uint user; + ap_uint<1> last; + ap_uint id; + ap_uint dest; + + ap_uint *get_user_ptr() { +#pragma HLS inline + return (WUser == 0) ? nullptr : &user; + } + ap_uint *get_id_ptr() { +#pragma HLS inline + return (WId == 0) ? nullptr : &id; + } + ap_uint *get_dest_ptr() { +#pragma HLS inline + return (WDest == 0) ? nullptr : &dest; + } +}; + +} // namespace hls + +template +using ap_axis = hls::axis, WUser, WId, WDest>; + +template +using ap_axiu = hls::axis, WUser, WId, WDest>; + +// Isolate out qdma_axis from hls::axis for special APIs. 
+template +struct qdma_axis; + +template struct qdma_axis { + // private: + static constexpr std::size_t kBytes = (WData + 7) / 8; + + ap_uint data; + ap_uint keep; + ap_uint<1> strb; + ap_uint<1> user; + ap_uint<1> last; + ap_uint<1> id; + ap_uint<1> dest; + + ap_uint<1> *get_strb_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_user_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_id_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_dest_ptr() { +#pragma HLS inline + return nullptr; + } + + // public: + ap_uint get_data() const { +#pragma HLS inline + return data; + } + ap_uint get_keep() const { +#pragma HLS inline + return keep; + } + ap_uint<1> get_last() const { +#pragma HLS inline + return last; + } + + void set_data(const ap_uint &d) { +#pragma HLS inline + data = d; + } + void set_keep(const ap_uint &k) { +#pragma HLS inline + keep = k; + } + void set_last(const ap_uint<1> &l) { +#pragma HLS inline + last = l; + } + void keep_all() { +#pragma HLS inline + ap_uint k = 0; + keep = ~k; + } + + qdma_axis() { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d) : data(d) { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d, ap_uint k) : data(d), keep(k) { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d, ap_uint k, ap_uint<1> l) + : data(d), keep(k), last(l) { +#pragma HLS inline + ; + } + qdma_axis(const qdma_axis &d) + : data(d.data), keep(d.keep), last(d.last) { +#pragma HLS inline + ; + } + qdma_axis &operator=(const qdma_axis &d) { +#pragma HLS inline + data = d.data; + keep = d.keep; + last = d.last; + return *this; + } +}; + +#ifdef AESL_SYN +#if ((__clang_major__ != 3) || (__clang_minor__ != 1)) +#include "hls_stream.h" +namespace hls { + +template +class stream> final { + typedef axis __STREAM_T__; + +public: + /// Constructors + INLINE stream() {} + + INLINE stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE stream(const 
stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid(&V.data, &V.keep, &V.strb, V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready(&V.data, &V.keep, &V.strb, V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + void read(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + dout = tmp; + } + + __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + return tmp; + } + + /// Blocking write + void write(const __STREAM_T__ &din) { +#pragma HLS inline + __STREAM_T__ tmp = din; + __fpga_axis_push(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + } + + /// Non-Blocking read + bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// 
Non-Blocking write + bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + &V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, V.get_id_ptr(), + V.get_dest_ptr(), &tmp.data, &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +// specialization for qdma +template +class stream> { + typedef qdma_axis __STREAM_T__; + +public: + /// Constructors + INLINE stream() {} + + INLINE stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + void read(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + dout = tmp; + } + + __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + return 
tmp; + } + + /// Blocking write + void write(const __STREAM_T__ &din) { +#pragma HLS inline + __STREAM_T__ tmp = din; + __fpga_axis_push(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + } + + /// Non-Blocking read + bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + + if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// Non-Blocking write + bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + &V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, V.get_id_ptr(), + V.get_dest_ptr(), &tmp.data, &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +} // namespace hls +#endif +#endif +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 8d8ff9712e..462bf2571b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -34,7 +34,7 @@ void depthwise_product_resource_rf_leq_nchan(data_T data[CONFIG_T::kernel_size * typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - std::cout << "LEQ IMPLE" << std::endl; + // std::cout << "LEQ IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { @@ -101,7 +101,7 @@ void depthwise_product_resource_rf_gt_nchan_rem0(data_T data[CONFIG_T::kernel_si typename 
CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - std::cout << "REM0 IMPLE" << std::endl; + // std::cout << "REM0 IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { @@ -165,7 +165,7 @@ void depthwise_product_resource_rf_gt_nchan(data_T data[CONFIG_T::kernel_size * typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - std::cout << "GT IMPLE" << std::endl; + // std::cout << "GT IMPLE" << std::endl; InitAccum: for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 382ff658ad..a6510d4733 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -32,7 +32,7 @@ def write_axi_wrapper(self, model): newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif '// hls-fpga-machine-learning insert include' in line: newline = f'#include "{model.config.get_project_name()}.h"\n' - newline = '#include "ap_axi_sdata.h' + newline += '#include "ap_axi_sdata.h"\n' elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '// hls-fpga-machine-learning insert definitions' in line: @@ -171,10 +171,8 @@ def write_axi_wrapper(self, model): newline = '' newline += indent + 'my_pkt tmp_a;\n' - newline = '' newline += indent + 'my_pkt tmp_b;\n' - newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed newline += indent + indent + '{input_t} ctype;\n' From b9609dc5ed5636506be1cf02f708a1542e0bf158 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 11 Jun 2024 15:08:20 +0200 Subject: [PATCH 013/103] Change python-cpp bridge writer --- hls4ml/writer/vitis_accelerator_writer.py | 27 ++++++++++++++--------- 1 file 
changed, 17 insertions(+), 10 deletions(-) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index a6510d4733..650cf77100 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -318,10 +318,10 @@ def write_wrapper_test(self, model): newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp() in line: newline = line.replace( - inp.definition_cpp(), 'input_axi_t inputs[N_IN]' + inp.definition_cpp(), 'hls::stream< my_pkt > inputs' ) # TODO instead of replacing strings, how about we use proper variables and their definition? elif out.definition_cpp() in line: - newline = line.replace(out.definition_cpp(), 'output_axi_t outputs[N_OUT]') + newline = line.replace(out.definition_cpp(), 'hls::stream< my_pkt > outputs') elif 'unsigned short' in line: newline = '' elif f'{model.config.get_project_name()}(' in line: @@ -329,11 +329,11 @@ def write_wrapper_test(self, model): newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = ( - line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'hls::stream< my_pkt >') ) elif out.size_cpp() in line or out.name in line or out.type.name in line: newline = ( - line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'output_axi_t') + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'hls::stream< my_pkt >') ) else: newline = line @@ -365,18 +365,25 @@ def write_wrapper_test(self, model): if f'{model.config.get_project_name()}.h' in line: newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif 
inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'input_axi_t {inp.name}_ap[N_IN]') + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {inp.name}_ap') elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'output_axi_t {out.name}_ap[N_OUT]') + newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {out.name}_ap') elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( model.config.get_project_name(), inp.name, out.name ) - elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'input_axi_t') - elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'output_axi_t') + # elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + # newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'hls::stream< my_pkt >') + # elif out.size_cpp() in line or out.name in line or out.type.name in line: + # newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'hls::stream< my_pkt >') + + elif ("nnet::convert_data Date: Tue, 11 Jun 2024 15:13:27 +0200 Subject: [PATCH 014/103] Fix tlast handling in axis wrapper writer --- hls4ml/writer/vitis_accelerator_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 650cf77100..76013fcf6e 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -206,6 +206,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + '}}\n' newline += indent + indent + 
'in_local.write(ctype);\n' newline += indent + '}}\n' + newline += indent + 'tmp_b = tmp_a;\n' + newline += indent + 'tmp_b.last = 0;\n' newline = newline.format(input_t=inp.type.name) elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") From 014a7b2ec43a9218b3ec5c39d1aa6ea17b6f750f Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 11 Jun 2024 17:33:51 +0200 Subject: [PATCH 015/103] Extend convert_data to handle stream type, use that for the bridge --- .../templates/vivado/nnet_utils/nnet_helpers.h | 16 ++++++++++++++++ hls4ml/writer/vitis_accelerator_writer.py | 17 ++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index b8c2a48d19..3938af347c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -2,6 +2,7 @@ #define NNET_HELPERS_H #include "hls_stream.h" +#include "ap_axi_sdata.h" #include #include #include @@ -161,6 +162,21 @@ template void convert_data(hls::stre } } +template void convert_data(srcType *src, hls::stream> &dst) { + for (size_t i = 0; i < SIZE; i++) { + hls::axis ctype; + ctype.data = dstType(src[i]); + dst.write(ctype); + } +} + +template void convert_data(hls::stream> &src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + hls::axis ctype = src.read(); + dst[i] = dstType(ctype.data); + } +} + extern bool trace_enabled; extern std::map *trace_outputs; extern size_t trace_type_size; diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 76013fcf6e..cd9e349b4b 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -308,6 +308,8 @@ def write_wrapper_test(self, model): ################### oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' newfile = 
f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' + + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() f = open(oldfile) fout = open(newfile, 'w') @@ -375,17 +377,10 @@ def write_wrapper_test(self, model): newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( model.config.get_project_name(), inp.name, out.name ) - # elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - # newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'hls::stream< my_pkt >') - # elif out.size_cpp() in line or out.name in line or out.type.name in line: - # newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 'hls::stream< my_pkt >') - - elif ("nnet::convert_data Date: Fri, 14 Jun 2024 10:55:07 +0200 Subject: [PATCH 016/103] Add zcu102 to the supported boards json --- .../vitis_accelerator/supported_boards.json | 6 ++ hls4ml/writer/vitis_accelerator_writer.py | 56 ------------------- 2 files changed, 6 insertions(+), 56 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json index 5f44560ccd..4a54ea2924 100644 --- a/hls4ml/backends/vitis_accelerator/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -4,5 +4,11 @@ "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} + }, + "zcu102": { + "part": "xczu9eg-ffvb1156-2-e", + "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} } } diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index cd9e349b4b..70573bb5c2 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -41,56 +41,6 @@ def write_axi_wrapper(self, model): 
newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': newline += f'typedef hls::axis<{inp_axi_t}, 0, 0, 0> my_pkt;;\n' - # newline += f'typedef {inp_axi_t} T_in;\n' - # newline += f'typedef {out_axi_t} T_out;\n' - # newline += ( - # 'typedef struct in_struct {\n' - # + indent - # + 'T_in data;\n' - # + indent - # + 'ap_uint<1> last;\n' - # + indent - # + 'in_struct(const T_in& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - # + indent - # + 'in_struct(){this->data = 0; this->last = 0;};\n' - # + indent - # + 'friend std::ostream& operator<<(std::ostream& stream, const in_struct& in)\n' - # + indent - # + '{ return stream << "{ data: " << in.data << ", last: " << in.last << " }" << std::endl; }\n' - # + indent - # + 'operator float() const {return this->data;}\n' - # + indent - # + 'operator double() const {return this->data;}\n' - # + indent - # + 'in_struct(float data) {this->data = data; this->last = 0;}\n' - # + indent - # + 'in_struct(double data) {this->data = data; this->last = 0;}\n' - # + '} input_axi_t;\n' - # ) - # newline += ( - # 'typedef struct out_struct {\n' - # + indent - # + 'T_out data;\n' - # + indent - # + 'ap_uint<1> last;\n' - # + indent - # + 'out_struct(const T_out& data, const ap_uint<1>& last){this->data = data; this->last = last;};\n' - # + indent - # + 'out_struct(){this->data = 0; this->last = 0;};\n' - # + indent - # + 'friend std::ostream& operator<<(std::ostream& stream, const out_struct& out)\n' - # + indent - # + '{ return stream << "{ data: " << out.data << ", last: " << out.last << " }" << std::endl; }\n' - # + indent - # + 'operator float() const {return this->data;}\n' - # + indent - # + 'operator double() const {return this->data;}\n' - # + indent - # + 'out_struct(float data) {this->data = data; this->last = 0;}\n' - # + indent - # + 'out_struct(double data) {this->data = data; this->last = 0;}\n' - # + '} output_axi_t;\n' - # ) 
else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' @@ -230,12 +180,6 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': - # newline += ( - # indent - # + indent - # + indent - # + 'bool last = (is_last && (i * {result_t}::size + j == N_OUT - 1)) ? true : false;\n' - # ) newline += ( indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' ) From 290896b73cfaf35d941734640642a81e56d014f9 Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 20 Jun 2024 14:40:08 +0200 Subject: [PATCH 017/103] Fix some c synthesis warnings --- hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h | 2 +- hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h | 2 +- hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h | 4 +++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h index 20b6fecb49..aad5d9a430 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -86,7 +86,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h index 
a3747990e0..a119fb9e2a 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -120,7 +120,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_2d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index 254fc5067b..ce097399c0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -106,7 +106,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index d56ed6d9a4..c4e0654890 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -130,7 +130,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth 
depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index 462bf2571b..dea028d53b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -207,7 +207,7 @@ template void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - #pragma HLS INLINE + // #pragma HLS INLINE typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; @@ -239,8 +239,10 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c // Accumulate multiplication result Accum1: for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) { + // #pragma HLS PIPELINE II=1 rewind Accum2: for (int jj = 0; jj < CONFIG_T::n_chan; jj++) { + // #pragma HLS UNROLL int index = ii * CONFIG_T::n_chan + jj; acc[jj] += mult[index]; } From c9dfcf267395c8f4a6313175f554fce88ab8b973 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 12 Apr 2024 01:41:43 +0200 Subject: [PATCH 018/103] Group more tests per YAML to reduce the number of envs created --- .gitlab-ci.yml | 2 ++ test/pytest/generate_ci_yaml.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a491d2f7b..a4aa6d507a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,8 @@ stages: generator: stage: generate image: python:3.8-alpine + variables: + N_TESTS_PER_YAML: 5 tags: - k8s-default before_script: diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index 107cabdbbb..7a495267ab 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ 
-1,4 +1,6 @@ import glob +import itertools +import os import yaml @@ -15,6 +17,14 @@ EXAMPLEMODEL: {} """ +n_test_files_per_yml = int(os.environ.get('N_TESTS_PER_YAML', 4)) + + +def batched(iterable, chunk_size): + iterator = iter(iterable) + while chunk := tuple(itertools.islice(iterator, chunk_size)): + yield chunk + def uses_example_model(test_filename): with open(test_filename) as f: @@ -24,9 +34,12 @@ def uses_example_model(test_filename): yml = None tests = glob.glob('test_*.py') -for test in tests: - name = test.replace('test_', '').replace('.py', '') - new_yml = yaml.safe_load(template.format(name, f'test_{name}.py', int(uses_example_model(test)))) +for test_batch in batched(tests, n_test_files_per_yml): + name = '+'.join([test.replace('test_', '').replace('.py', '') for test in test_batch]) + test_files = ' '.join(list(test_batch)) + uses_example_models = int(any([uses_example_model(test) for test in test_batch])) + + new_yml = yaml.safe_load(template.format(name, test_files, uses_example_models)) if yml is None: yml = new_yml else: From d3b8e20f9af537e4325619ccb3f3d619b3fe667b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 26 Mar 2024 00:22:41 +0100 Subject: [PATCH 019/103] Support negative_slope in quantized_relu --- hls4ml/converters/keras/qkeras.py | 4 ++++ hls4ml/model/profiling.py | 1 + hls4ml/utils/config.py | 7 ++++-- test/pytest/test_qkeras.py | 38 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index a8038da46d..e610177196 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -166,6 +166,10 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) layer['slope_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) layer['shift_prec'] = FixedPrecisionType(width=2, integer=0, signed=False) layer['activation'] = 
activation_config['class_name'].replace('quantized_', 'hard_') + elif activation_config['class_name'] == 'quantized_relu' and activation_config['config']['negative_slope'] != 0: + layer['class_name'] = 'LeakyReLU' + layer['activation'] = activation_config['class_name'].replace('quantized_', 'leaky_') + layer['activ_param'] = activation_config['config']['negative_slope'] else: layer['class_name'] = 'Activation' layer['activation'] = activation_config['class_name'].replace('quantized_', '') diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 904ecc3d35..7cdef74ff3 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -589,6 +589,7 @@ def get_ymodel_keras(keras_model, X): name = layer.name if ( hasattr(layer, "activation") + and layer.activation is not None and layer.activation.__name__ != "linear" and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) ): diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 5d7ca1ae72..7294dcf6fe 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -79,8 +79,11 @@ def _get_precision_from_quantizer(quantizer): rnd = "AP_RND_CONV" overflow = "AP_SAT" if quantizer['class_name'] in ('quantized_relu', 'quantized_relu_po2'): - signed = False - integer -= 1 + if quantizer['config']['negative_slope'] != 0.0: + signed = True + else: + signed = False + integer -= 1 elif quantizer['class_name'] == 'quantized_tanh': overflow = "AP_SAT_SYM" if quantizer['config']['symmetric'] else "AP_SAT" integer = 1 diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 1812776684..61a2b15a4a 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -307,6 +307,44 @@ def test_quantizer(randX_1000_1, quantizer, backend, io_type): np.testing.assert_array_equal(y_qkeras, y_hls4ml) +@pytest.mark.parametrize( + 'quantizer', + [ + (quantized_relu(4, negative_slope=0.5)), + (quantized_relu(8, 4, negative_slope=1.0)), + 
(quantized_relu(10, 2, negative_slope=0.25)), + ], +) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): + ''' + Test a a transformation of quantized_relu with negative_slope to leaky_relu activation layer. + ''' + X = randX_1000_1 + X = -X # Make it negative so leaky relu does something + X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> + model = Sequential() + model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) + model.compile() + + config = hls4ml.utils.config_from_keras_model(model, granularity='name') + output_dir = str( + test_root_path + / 'hls4mlprj_qkeras_leaky_relu_{}_{}_neg_slope_{}_{}_{}'.format( + quantizer.bits, quantizer.integer, quantizer.negative_slope, backend, io_type + ) + ) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type + ) + hls_model.compile() + + y_qkeras = model.predict(X) + y_hls4ml = hls_model.predict(X) + np.testing.assert_allclose(y_hls4ml, y_qkeras, rtol=1e-5, atol=0) + + @pytest.mark.parametrize( 'weight_quantizer,activation_quantizer,', [ From b32984fa7265f501bd225f71c5b1798f21caba87 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:57:43 +0000 Subject: [PATCH 020/103] [pre-commit.ci] auto fixes from pre-commit hooks --- test/pytest/test_qkeras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 61a2b15a4a..f068e4e503 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -322,7 +322,7 @@ def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): Test a a transformation of quantized_relu with negative_slope to leaky_relu activation layer. 
''' X = randX_1000_1 - X = -X # Make it negative so leaky relu does something + X = -X # Make it negative so leaky relu does something X = np.round(X * 2**10) * 2**-10 # make it an exact ap_fixed<16,6> model = Sequential() model.add(QActivation(input_shape=(1,), activation=quantizer, name='quantizer')) From 98273a044b11e8e17b122ab55056dbc8d444a5ae Mon Sep 17 00:00:00 2001 From: Vladimir Date: Tue, 16 Apr 2024 21:30:43 +0200 Subject: [PATCH 021/103] Fix activation check in profiling --- hls4ml/model/profiling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 7cdef74ff3..84a83de23e 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -588,10 +588,10 @@ def get_ymodel_keras(keras_model, X): # Note that if the layer is a standalone activation layer then skip this name = layer.name if ( - hasattr(layer, "activation") + hasattr(layer, 'activation') and layer.activation is not None - and layer.activation.__name__ != "linear" and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) + and layer.activation.__name__ != 'linear' ): tmp_activation = layer.activation layer.activation = None From 1640c4bcd2f73a0cf0ee9772cded4f863b82c703 Mon Sep 17 00:00:00 2001 From: dgburnette <36940078+dgburnette@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:12:17 -0700 Subject: [PATCH 022/103] Stage initial set of changes for the Catapult backend (#956) * Stage initial set of changes for the Catapult backend * applied some changes for issues reported by pre-commit. But pre-commit still reorders backends/__init__.py incorrectly * final changes for clean pre-commit * minor edits * Checkin * Add file * pre-commit format * add in nnet_utils files * format changes for pre-commit * run flows by netlist type * update design pragmas on some blocks. 
cleaned up TCL script * move AC submodules under hls4ml/templates/catapult * merged in latest changes from mainline * remove bad submodules * recreate AC submodules in hls4ml/templates/catapult * pre-commit fixes * pre-commit fixes * turn on Catapult backend testing * removed io_parallel testing for Catapult backend * add Catapult * added Catapult * added Catapult * added Catapult to some pytests * Added concept of ProjectDir to distinguish the project directory of the HLS tool from the ProjectName which is used for the cpp file and top function name * better handling of c++ testbench data files. enhanced directory naming. * fix syntax * workaround from Giuseppe * Add concept of ProjectDir for Catapult which is different from ProjectName that gets used for the top function name and the cpp files * add new file from Giuseppe * improvements to project management, reporting and testbench * include new file in generation of parameters.h * add hard_tanh for io_parallel. formatting * Full path to the header nnet_helpers.h is necessary in the include (check if this is not an issue with recent versions of Catapult) * Avoid ceiling function from the math library: ceil(n/d) ---> (n+d-1)/n * These are mostly workarounds for the BUP synyhesis of a testing model (should these changes make in something more general?) * revert format back to what clang-format currently enforces * simplification from Giuesspe * Fixes for bottom-up handling of libraries * pre-commit format fixes * fix loops * consolidate prj scripts * cleanup pragmas * switch from using ssh to https for submodules * fix include path for non-catapult install * update pytest environment * CL 1100381 * CL 1098112 * roll in latest changes. 
turn off Catapult variants of test_binary_cnn and test_cnn_mnist_qkeras for now * fix test failure * disable Catapult test for pytorch until it is supported * disable Catapult for pytorch tests * Simpler submodule initialization for CI --------- Co-authored-by: David Burnette Co-authored-by: Giuseppe Di Guglielmo Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir Loncar --- .gitmodules | 9 + hls4ml/backends/__init__.py | 3 + hls4ml/backends/catapult/__init__.py | 0 hls4ml/backends/catapult/catapult_backend.py | 515 ++++++++ hls4ml/backends/catapult/passes/__init__.py | 0 .../catapult/passes/broadcast_stream.py | 117 ++ .../backends/catapult/passes/conv_same_pad.py | 109 ++ .../backends/catapult/passes/conv_stream.py | 52 + .../catapult/passes/convolution_templates.py | 508 ++++++++ .../catapult/passes/convolution_winograd.py | 175 +++ .../catapult/passes/core_templates.py | 216 ++++ .../passes/fifo_depth_optimization.py | 104 ++ .../catapult/passes/garnet_templates.py | 249 ++++ .../catapult/passes/merge_templates.py | 106 ++ hls4ml/backends/catapult/passes/pointwise.py | 92 ++ .../catapult/passes/pooling_templates.py | 109 ++ .../catapult/passes/quantization_templates.py | 36 + .../catapult/passes/recurrent_templates.py | 175 +++ .../catapult/passes/reshaping_templates.py | 132 ++ .../catapult/passes/resource_strategy.py | 48 + .../catapult/passes/transform_types.py | 52 + hls4ml/backends/fpga/fpga_types.py | 65 + hls4ml/converters/__init__.py | 6 +- hls4ml/model/graph.py | 6 + hls4ml/model/layers.py | 1 + hls4ml/model/profiling.py | 5 +- hls4ml/report/__init__.py | 3 + hls4ml/report/catapult_report.py | 256 ++++ hls4ml/templates/catapult/ac_math | 1 + hls4ml/templates/catapult/ac_simutils | 1 + hls4ml/templates/catapult/ac_types | 1 + hls4ml/templates/catapult/build_lib.sh | 21 + hls4ml/templates/catapult/build_prj.tcl | 356 ++++++ hls4ml/templates/catapult/catapult_synth.tcl | 3 + hls4ml/templates/catapult/firmware/defines.h | 15 + 
.../templates/catapult/firmware/myproject.cpp | 29 + .../templates/catapult/firmware/myproject.h | 15 + .../templates/catapult/firmware/parameters.h | 15 + .../templates/catapult/myproject_bridge.cpp | 72 ++ hls4ml/templates/catapult/myproject_test.cpp | 164 +++ .../catapult/nnet_utils/ap_shift_reg.h | 136 ++ .../templates/catapult/nnet_utils/hls_math.h | 24 + .../catapult/nnet_utils/nnet_activation.h | 1107 +++++++++++++++++ .../nnet_utils/nnet_activation_stream.h | 922 ++++++++++++++ .../catapult/nnet_utils/nnet_array.h | 52 + .../catapult/nnet_utils/nnet_batchnorm.h | 127 ++ .../nnet_utils/nnet_batchnorm_stream.h | 113 ++ .../catapult/nnet_utils/nnet_code_gen.h | 32 + .../catapult/nnet_utils/nnet_common.h | 66 + .../catapult/nnet_utils/nnet_conv1d.h | 62 + .../catapult/nnet_utils/nnet_conv1d_latency.h | 198 +++ .../nnet_utils/nnet_conv1d_resource.h | 241 ++++ .../catapult/nnet_utils/nnet_conv1d_stream.h | 94 ++ .../catapult/nnet_utils/nnet_conv2d.h | 84 ++ .../catapult/nnet_utils/nnet_conv2d_latency.h | 392 ++++++ .../nnet_utils/nnet_conv2d_resource.h | 275 ++++ .../catapult/nnet_utils/nnet_conv2d_stream.h | 117 ++ .../catapult/nnet_utils/nnet_conv_stream.h | 398 ++++++ .../catapult/nnet_utils/nnet_dense.h | 49 + .../nnet_utils/nnet_dense_compressed.h | 106 ++ .../catapult/nnet_utils/nnet_dense_latency.h | 92 ++ .../catapult/nnet_utils/nnet_dense_resource.h | 262 ++++ .../catapult/nnet_utils/nnet_dense_stream.h | 72 ++ .../catapult/nnet_utils/nnet_embed.h | 47 + .../catapult/nnet_utils/nnet_embed_stream.h | 34 + .../catapult/nnet_utils/nnet_garnet.h | 816 ++++++++++++ .../catapult/nnet_utils/nnet_helpers.h | 461 +++++++ .../catapult/nnet_utils/nnet_image.h | 41 + .../catapult/nnet_utils/nnet_image_stream.h | 66 + .../templates/catapult/nnet_utils/nnet_math.h | 178 +++ .../catapult/nnet_utils/nnet_merge.h | 232 ++++ .../catapult/nnet_utils/nnet_merge_stream.h | 380 ++++++ .../templates/catapult/nnet_utils/nnet_mult.h | 127 ++ 
.../catapult/nnet_utils/nnet_padding.h | 145 +++ .../catapult/nnet_utils/nnet_padding_stream.h | 95 ++ .../catapult/nnet_utils/nnet_pooling.h | 362 ++++++ .../catapult/nnet_utils/nnet_pooling_stream.h | 601 +++++++++ .../nnet_utils/nnet_recr_activations.h | 56 + .../catapult/nnet_utils/nnet_recurrent.h | 572 +++++++++ .../nnet_utils/nnet_sepconv1d_stream.h | 127 ++ .../catapult/nnet_utils/nnet_sepconv2d.h | 82 ++ .../nnet_utils/nnet_sepconv2d_stream.h | 152 +++ .../catapult/nnet_utils/nnet_sepconv_stream.h | 315 +++++ .../catapult/nnet_utils/nnet_stream.h | 156 +++ .../catapult/nnet_utils/nnet_types.h | 64 + .../templates/vivado_accelerator/build_lib.sh | 0 hls4ml/writer/__init__.py | 2 + hls4ml/writer/catapult_writer.py | 929 ++++++++++++++ test/pytest/ci-template.yml | 3 +- test/pytest/test_activations.py | 2 +- test/pytest/test_batchnorm.py | 2 +- test/pytest/test_batchnorm_pytorch.py | 2 +- test/pytest/test_clone_flatten.py | 2 +- test/pytest/test_cnn_mnist.py | 2 +- test/pytest/test_conv1d.py | 4 + test/pytest/test_embed.py | 4 +- test/pytest/test_globalpooling.py | 4 +- test/pytest/test_keras_h5_loader.py | 2 +- test/pytest/test_keras_nested_model.py | 4 +- test/pytest/test_pointwiseconv.py | 4 + test/pytest/test_pooling.py | 4 +- test/pytest/test_repack_stream.py | 4 +- test/pytest/test_reshape.py | 2 +- test/pytest/test_sepconv1d.py | 2 +- test/pytest/test_sepconv2d.py | 2 +- test/pytest/test_softmax.py | 4 +- test/pytest/test_softsign.py | 2 +- test/pytest/test_upsampling.py | 2 +- test/pytest/test_zeropadding.py | 2 +- 109 files changed, 14932 insertions(+), 30 deletions(-) create mode 100644 hls4ml/backends/catapult/__init__.py create mode 100644 hls4ml/backends/catapult/catapult_backend.py create mode 100644 hls4ml/backends/catapult/passes/__init__.py create mode 100644 hls4ml/backends/catapult/passes/broadcast_stream.py create mode 100755 hls4ml/backends/catapult/passes/conv_same_pad.py create mode 100755 hls4ml/backends/catapult/passes/conv_stream.py 
create mode 100755 hls4ml/backends/catapult/passes/convolution_templates.py create mode 100644 hls4ml/backends/catapult/passes/convolution_winograd.py create mode 100755 hls4ml/backends/catapult/passes/core_templates.py create mode 100755 hls4ml/backends/catapult/passes/fifo_depth_optimization.py create mode 100755 hls4ml/backends/catapult/passes/garnet_templates.py create mode 100755 hls4ml/backends/catapult/passes/merge_templates.py create mode 100755 hls4ml/backends/catapult/passes/pointwise.py create mode 100755 hls4ml/backends/catapult/passes/pooling_templates.py create mode 100755 hls4ml/backends/catapult/passes/quantization_templates.py create mode 100755 hls4ml/backends/catapult/passes/recurrent_templates.py create mode 100755 hls4ml/backends/catapult/passes/reshaping_templates.py create mode 100755 hls4ml/backends/catapult/passes/resource_strategy.py create mode 100755 hls4ml/backends/catapult/passes/transform_types.py create mode 100755 hls4ml/report/catapult_report.py create mode 160000 hls4ml/templates/catapult/ac_math create mode 160000 hls4ml/templates/catapult/ac_simutils create mode 160000 hls4ml/templates/catapult/ac_types create mode 100755 hls4ml/templates/catapult/build_lib.sh create mode 100755 hls4ml/templates/catapult/build_prj.tcl create mode 100644 hls4ml/templates/catapult/catapult_synth.tcl create mode 100755 hls4ml/templates/catapult/firmware/defines.h create mode 100755 hls4ml/templates/catapult/firmware/myproject.cpp create mode 100755 hls4ml/templates/catapult/firmware/myproject.h create mode 100755 hls4ml/templates/catapult/firmware/parameters.h create mode 100755 hls4ml/templates/catapult/myproject_bridge.cpp create mode 100755 hls4ml/templates/catapult/myproject_test.cpp create mode 100644 hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h create mode 100755 hls4ml/templates/catapult/nnet_utils/hls_math.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_activation.h create mode 100644 
hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_array.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_common.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_embed.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_garnet.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_helpers.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_image.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_math.h create mode 
100644 hls4ml/templates/catapult/nnet_utils/nnet_merge.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_mult.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_padding.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_pooling.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h create mode 100755 hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/catapult/nnet_utils/nnet_types.h mode change 100644 => 100755 hls4ml/templates/vivado_accelerator/build_lib.sh create mode 100755 hls4ml/writer/catapult_writer.py diff --git a/.gitmodules b/.gitmodules index 3513213a23..98c3df68fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,12 @@ [submodule "example-models"] path = example-models url = https://github.com/hls-fpga-machine-learning/example-models.git +[submodule "hls4ml/templates/catapult/ac_types"] + path = hls4ml/templates/catapult/ac_types + url = https://github.com/hlslibs/ac_types.git +[submodule "hls4ml/templates/catapult/ac_simutils"] + path = hls4ml/templates/catapult/ac_simutils + url = https://github.com/hlslibs/ac_simutils.git +[submodule "hls4ml/templates/catapult/ac_math"] + path = hls4ml/templates/catapult/ac_math + url = https://github.com/hlslibs/ac_math.git diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 
91a9272e74..f1eebd3c1f 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -7,6 +7,8 @@ from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 +from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip + from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig # noqa: F401 @@ -16,4 +18,5 @@ register_backend('Vitis', VitisBackend) register_backend('VitisAccelerator', VitisAcceleratorBackend) register_backend('Quartus', QuartusBackend) +register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/catapult/__init__.py b/hls4ml/backends/catapult/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py new file mode 100644 index 0000000000..5556154dcb --- /dev/null +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -0,0 +1,515 @@ +import os +import sys + +import numpy as np + +from hls4ml.backends import FPGABackend +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, CatapultArrayVariableConverter, HLSTypeConverter +from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import ( + GRU, + LSTM, + Conv1D, + Conv2D, + Dense, + DepthwiseConv2D, + Embedding, + GarNet, + GarNetStack, + GlobalPooling1D, + GlobalPooling2D, + Layer, + Pooling1D, + Pooling2D, + SeparableConv1D, + SeparableConv2D, + SimpleRNN, + Softmax, +) +from hls4ml.model.optimizer 
import get_backend_passes, layer_optimizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType +from hls4ml.report import parse_catapult_report +from hls4ml.utils.fixed_point_utils import ceil_log2 + + +class CatapultBackend(FPGABackend): + def __init__(self): + super().__init__('Catapult') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + # Add RNN-specific attributes, recurrent_reuse_factor and static implementation + rnn_layers = [ + SimpleRNN, + LSTM, + GRU, + ] + + for layer in rnn_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('static', value_type=bool, default=True)) + attrs.append(ConfigurableAttribute('table_size', default=1024)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + # Add ParallelizationFactor to Conv1D/2D + pf_layers = [ + Conv1D, + Conv2D, + ] + + for layer in pf_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + self.attribute_map[layer] = attrs + + # Add ConvImplementation to Convolution+Pooling layers + cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] + + for layer in cnn_layers: + attrs = self.attribute_map.get(layer, []) + # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) + attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + self.attribute_map[layer] = attrs + + sep_conv_layers = [SeparableConv1D, SeparableConv2D] + for layer in sep_conv_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(TypeAttribute('dw_output', default=FixedPrecisionType(18, 8))) + self.attribute_map[layer] = attrs + + def 
_register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + streaming_passes = [ + 'catapult:reshape_stream', + 'catapult:clone_output', + 'catapult:insert_zero_padding_before_conv1d', + 'catapult:insert_zero_padding_before_conv2d', + 'catapult:broadcast_stream', + ] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) + + quantization_passes = [ + 'catapult:merge_batch_norm_quantized_tanh', + 'catapult:quantize_dense_output', + 'fuse_consecutive_batch_normalization', + 'catapult:xnor_pooling', + ] + quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) + + optimization_passes = [ + 'catapult:remove_final_reshape', + 'catapult:optimize_pointwise_conv', + 'catapult:inplace_parallel_reshape', + 'catapult:inplace_stream_flatten', + 'catapult:skip_softmax', + 'catapult:fix_softmax_table_size', + ] + optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) + + catapult_types = [ + 'catapult:transform_types', + 'catapult:register_bram_weights', + 'catapult:generate_conv_streaming_instructions', + 'catapult:apply_resource_strategy', + 'catapult:generate_conv_im2col', + ] + catapult_types_flow = register_flow('specific_types', catapult_types, requires=[init_flow], backend=self.name) + + templates = self._get_layer_templates() + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'catapult:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['catapult:ip'], backend=self.name) + + fifo_depth_opt_passes = [ + 'catapult:fifo_depth_optimization' + ] + writer_passes # After optimization, a new project will be written + + register_flow('fifo_depth_optimization', 
fifo_depth_opt_passes, requires=[self._writer_flow], backend=self.name) + + all_passes = get_backend_passes(self.name) + + extras = [ + # Ideally this should be empty + opt_pass + for opt_pass in all_passes + if opt_pass + not in initializers + + streaming_passes + + quantization_passes + + optimization_passes + + catapult_types + + templates + + writer_passes + + fifo_depth_opt_passes + ] + + if len(extras) > 0: + extras_flow = register_flow('extras', extras, requires=[init_flow], backend=self.name) + else: + extras_flow = None + + ip_flow_requirements = [ + 'optimize', + init_flow, + streaming_flow, + quantization_flow, + optimization_flow, + catapult_types_flow, + extras_flow, + template_flow, + ] + ip_flow_requirements = list(filter(None, ip_flow_requirements)) + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config( + self, + tech='fpga', + part='xcku115-flvb2104-2-i', + asiclibs='nangate-45nm', + fifo=None, + clock_period=5, + io_type='io_parallel', + ): + config = {} + + config['Technology'] = tech + if tech == 'fpga': + config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e' + else: + config['ASICLibs'] = asiclibs if asiclibs is not None else 'nangate-45nm' + config['ClockPeriod'] = clock_period + config['FIFO'] = fifo + config['IOType'] = io_type + config['HLSConfig'] = {} + + return config + + def build( + self, + model, + reset=False, + csim=True, + synth=True, + cosim=False, + validation=False, + vhdl=False, + verilog=True, + export=False, + vsynth=False, + fifo_opt=False, + bitfile=False, + ran_frame=5, + sw_opt=False, + power=False, + da=False, + bup=False, + ): + # print(f'ran_frame value: {ran_frame}') # Add this line for debugging + catapult_exe = 'catapult' + if 'linux' in sys.platform: + cmd = 'command -v ' + catapult_exe + ' > /dev/null' + 
found = os.system(cmd) + if found != 0: + catapult_exe = os.getenv('MGC_HOME') + '/bin/catapult' + cmd = 'command -v ' + catapult_exe + ' > /dev/null' + found = os.system(cmd) + if found != 0: + catapult_exe = os.getenv('CATAPULT_HOME') + '/bin/catapult' + cmd = 'command -v ' + catapult_exe + ' > /dev/null' + if found != 0: + raise Exception('Catapult HLS installation not found. Make sure "catapult" is on PATH.') + + curr_dir = os.getcwd() + # this execution moves into the hls4ml-generated "output_dir" and runs the build_prj.tcl script. + os.chdir(model.config.get_output_dir()) + ccs_args = f'"reset={reset} csim={csim} synth={synth} cosim={cosim} validation={validation}' + ccs_args += f' export={export} vsynth={vsynth} fifo_opt={fifo_opt} bitfile={bitfile} ran_frame={ran_frame}' + ccs_args += f' sw_opt={sw_opt} power={power} da={da} vhdl={vhdl} verilog={verilog} bup={bup}"' + ccs_invoke = catapult_exe + ' -product ultra -shell -f build_prj.tcl -eval \'set ::argv ' + ccs_args + '\'' + print(ccs_invoke) + os.system(ccs_invoke) + os.chdir(curr_dir) + + return parse_catapult_report(model.config.get_output_dir()) + + def _validate_conv_strategy(self, layer): + if layer.model.config.pipeline_style.lower() != 'dataflow': + print(f'WARNING: Layer {layer.name} requires "dataflow" pipeline style. 
Switching to "dataflow" pipeline style.') + layer.model.config.pipeline_style = 'dataflow' + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + target_cycles = layer.model.config.get_target_cycles(layer) + layer.set_attr('target_cycles', target_cycles) + + @layer_optimizer(Dense) + def init_dense(self, layer): + index_t = IntegerPrecisionType(width=1, signed=False) + compression = layer.model.config.get_compression(layer) + if layer.model.config.is_resource_strategy(layer): + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + if compression: + layer.set_attr('strategy', 'compressed') + index_t = layer.get_weights('weight').type.index_precision + else: + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) + + # TODO consolidate these functions into a single `init_conv` + @layer_optimizer(Conv1D) + def init_conv1d(self, layer): + if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv1D + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + out_width = layer.get_output_variable().shape[0] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(1, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, 
valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' + ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_width // closest_pf) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + self._validate_conv_strategy(layer) + + @layer_optimizer(SeparableConv1D) + def init_sepconv1d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # Set the output type of the depthwise phase + dw_out_precision, _ = layer.model.config.get_precision(layer, 'dw_output') + dw_out_name = layer.name + '_dw_out_t' + if layer.model.config.get_config_value('IOType') == 'io_stream': + dw_output_t = PackedType(dw_out_name, dw_out_precision, layer.get_attr('n_chan'), n_pack=1) + else: + dw_output_t = NamedType(dw_out_name, dw_out_precision) + layer.set_attr('dw_output_t', dw_output_t) + + @layer_optimizer(Conv2D) + def init_conv2d(self, layer): + if len(layer.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D + layer.weights['weight'].data = np.expand_dims(layer.weights['weight'].data, axis=(0, 1)) + + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + self.set_target_reuse_factor(layer) + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + out_height = 
layer.get_output_variable().shape[0] + out_width = layer.get_output_variable().shape[1] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' + ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + self._validate_conv_strategy(layer) + + @layer_optimizer(SeparableConv2D) + def init_sepconv2d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # Set the output type of the depthwise phase + dw_out_precision, _ = layer.model.config.get_precision(layer, 'dw_output') + dw_out_name = layer.name + '_dw_out_t' + if layer.model.config.get_config_value('IOType') == 'io_stream': + dw_output_t = PackedType(dw_out_name, dw_out_precision, layer.get_attr('n_chan'), n_pack=1) + else: + dw_output_t = NamedType(dw_out_name, dw_out_precision) + layer.set_attr('dw_output_t', dw_output_t) + + @layer_optimizer(DepthwiseConv2D) + def init_depconv2d(self, layer): + if layer.model.config.is_resource_strategy(layer): + layer.set_attr('strategy', 'resource') + n_in, n_out = 
self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr( + 'n_partitions', 1 + ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + # Set the output type of the depthwise phase + dw_out_precision, _ = layer.model.config.get_precision(layer, 'dw_output') + dw_out_name = layer.name + '_dw_out_t' + if layer.model.config.get_config_value('IOType') == 'io_stream': + dw_output_t = PackedType(dw_out_name, dw_out_precision, layer.get_attr('n_chan'), n_pack=1) + else: + dw_output_t = NamedType(dw_out_name, dw_out_precision) + layer.set_attr('dw_output_t', dw_output_t) + + def _set_pooling_accum_t(self, layer, pool_size): + extra_bits = ceil_log2(pool_size) + accum_t = layer.get_attr('accum_t') + accum_t.precision.width += extra_bits * 2 + if isinstance(accum_t.precision, FixedPrecisionType): + accum_t.precision.integer += extra_bits + + @layer_optimizer(Pooling1D) + def init_pooling1d(self, layer): + pool_size = layer.get_attr('pool_width') + self._set_pooling_accum_t(layer, pool_size) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + @layer_optimizer(Pooling2D) + def init_pooling2d(self, layer): + pool_size = layer.get_attr('pool_height') * layer.get_attr('pool_width') + self._set_pooling_accum_t(layer, pool_size) + + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) + + @layer_optimizer(GlobalPooling1D) + def init_global_pooling1d(self, layer): + pool_size = layer.get_attr('n_in') + self._set_pooling_accum_t(layer, pool_size) + + @layer_optimizer(GlobalPooling2D) + def init_global_pooling2d(self, layer): + pool_size = layer.get_attr('in_height') * layer.get_attr('in_width') + self._set_pooling_accum_t(layer, pool_size) + + @layer_optimizer(Softmax) + 
def init_softmax(self, layer): + if layer.model.config.get_config_value('IOType') == 'io_parallel': + assert ( + len(layer.get_input_variable().shape) == 1 + ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + + @layer_optimizer(Embedding) + def init_embed(self, layer): + if layer.attributes['n_in'] is None: + raise Exception('Input length of Embedding layer must be specified.') + + @layer_optimizer(LSTM) + def init_lstm(self, layer): + # TODO Allow getting recurrent reuse factor from the config + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + if layer.model.config.is_resource_strategy(layer): + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) + + @layer_optimizer(GRU) + def init_gru(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('recurrent_reuse_factor', reuse_factor) + + if layer.model.config.is_resource_strategy(layer): + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') + + layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', IntegerPrecisionType(width=1, signed=False))) + + @layer_optimizer(GarNet) + def init_garnet(self, layer): + reuse_factor = layer.attributes['reuse_factor'] + + var_converter = CatapultArrayVariableConverter( + 
type_converter=HLSTypeConverter(precision_converter=ACTypeConverter()) + ) + + # A bit controversial but we are going to set the partitioning of the input here + in_layer = layer.model.graph[layer.inputs[0]] + in_var = layer.get_input_variable(layer.inputs[0]) + partition_factor = in_var.shape[1] * (in_var.shape[0] // reuse_factor) + in_pragma = ('partition', 'cyclic', partition_factor) + new_in_var = var_converter.convert(in_var, pragma=in_pragma) + in_layer.set_attr(layer.inputs[0], new_in_var) + + if layer.attributes['collapse']: + out_pragma = 'partition' + else: + partition_factor = layer._output_features * (layer.attributes['n_vertices'] // reuse_factor) + out_pragma = ('partition', 'cyclic', partition_factor) + + out_name, out_var = next(iter(layer.variables.items())) + new_out_var = var_converter.convert(out_var, pragma=out_pragma) + + layer.set_attr(out_name, new_out_var) + + @layer_optimizer(GarNetStack) + def init_garnet_stack(self, layer): + self.init_garnet(layer) diff --git a/hls4ml/backends/catapult/passes/__init__.py b/hls4ml/backends/catapult/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/catapult/passes/broadcast_stream.py b/hls4ml/backends/catapult/passes/broadcast_stream.py new file mode 100644 index 0000000000..97019e074b --- /dev/null +++ b/hls4ml/backends/catapult/passes/broadcast_stream.py @@ -0,0 +1,117 @@ +import numpy as np + +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Layer, Merge, register_layer +from hls4ml.model.optimizer import OptimizerPass + + +class Broadcast(Layer): + '''Inserted between layers for broadcasting.''' + + def initialize(self): + shape = self.attributes['target_shape'] + if shape[0] is None: + shape = shape[1:] + dims = [f'N_SIZE_{i}_{self.index}' for i in range(1, len(shape) + 1)] + self.add_output_variable(shape, dims) + + +broadcast_function_template = 
'nnet::broadcast_stream<{input_t}, {output_t}, {config}>({input}, {output});' +broadcast_config_template = """struct config{index} : nnet::broadcast_config {{ + static const unsigned in_width = {in_width}; + static const unsigned in_height = {in_height}; + static const unsigned in_chan = {in_chan}; + static const unsigned out_width = {out_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_chan = {out_chan}; +}};\n""" +broadcast_include_list = ['nnet_utils/nnet_stream.h'] + + +class BroadcastConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Broadcast) + self.template = broadcast_config_template + + def format(self, node): + params = self._default_config_params(node) + params['in_height'] = node.get_input_variable().shape[0] + params['in_width'] = node.get_input_variable().shape[1] + params['in_chan'] = node.get_input_variable().shape[2] + params['out_height'] = node.get_output_variable().shape[0] + params['out_width'] = node.get_output_variable().shape[1] + params['out_chan'] = node.get_output_variable().shape[2] + + return self.template.format(**params) + + +class BroadcastFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Broadcast, include_header=broadcast_include_list) + self.template = broadcast_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + + +def register_broadcast_stream(backend): + # Register the layer types to the layer map + register_layer('Broadcast', Broadcast) + + # Register the optimization passes + backend.register_pass('broadcast_stream', BroadcastStream) + + # Register template passes + backend.register_template(BroadcastConfigTemplate) + backend.register_template(BroadcastFunctionTemplate) + + +class BroadcastStream(OptimizerPass): + def match(self, node): + if isinstance(node, Merge) and not isinstance(node, Concatenate): + inp1 = 
node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + return inp1.shape != inp2.shape + else: + return False + + def transform(self, model, node): + if model.config.backend.name not in ['Catapult'] or model.config.get_config_value('IOType') != 'io_stream': + return False + + inp = [node.get_input_variable(inp_name) for inp_name in node.inputs] + + if np.prod(inp[0].shape) > np.prod(inp[1].shape): + idx = 1 + attrs = {'target_shape': inp[0].shape} + else: + idx = 0 + attrs = {'target_shape': inp[1].shape} + + def supported_broadcast(inp_shape, target_shape): + # Must be (H, W, C) + if not len(inp_shape) == 3: + return False + # Supported: (1, 1, C) -> (H, W, C) + if inp_shape[0] == inp_shape[1] == 1 and inp_shape[2] == target_shape[2]: + return True + # Supported: (H, W, 1) -> (H, W, C) + if inp_shape[2] == 1 and inp_shape[0] == target_shape[0] and inp_shape[1] == target_shape[1]: + return True + return False + + brdcst_inp = node.inputs[idx] + inp_shape = node.get_input_variable(brdcst_inp).shape + target_shape = attrs['target_shape'] + if not supported_broadcast(inp_shape, target_shape): + raise RuntimeError( + f'Unsupported broadcast type for stream: {inp_shape} -> {target_shape};' + + 'Only (1, 1, C) -> (H, W, C) and (H, W, 1) -> (H, W, C) currently supported' + ) + brdcst_out = 'broadcast_' + brdcst_inp + brdcst_layer = model.make_node('Broadcast', brdcst_out, attrs, [brdcst_inp].copy()) + model.insert_node(brdcst_layer, before=node, input_idx=idx) + node.inputs[idx] = brdcst_out + + return True diff --git a/hls4ml/backends/catapult/passes/conv_same_pad.py b/hls4ml/backends/catapult/passes/conv_same_pad.py new file mode 100755 index 0000000000..bb8354a3d0 --- /dev/null +++ b/hls4ml/backends/catapult/passes/conv_same_pad.py @@ -0,0 +1,109 @@ +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class 
InsertZeroPaddingBeforeConv1D(OptimizerPass): + name = 'insert_zero_padding_before_conv1d' + + def match(self, node): + is_match = ( + isinstance(node, (Conv1D, SeparableConv1D)) + and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) + and node.get_attr('filt_width') != 1 + ) + return is_match + + def transform(self, model, node): + if model.config.get_config_value('IOType') != 'io_stream': + return False + + # Get the padding parameters from Conv1D layer + pad_left = node.get_attr('pad_left') + pad_right = node.get_attr('pad_right') + + # Check if no padding needs to be done + if pad_left == pad_right == 0: + return False + + out_width = pad_left + node.get_attr('in_width') + pad_right + + attrs = { + 'pad_left': pad_left, + 'pad_right': pad_right, + 'in_width': node.get_attr('in_width'), + 'out_width': out_width, + 'n_chan': node.get_attr('n_chan'), + 'data_format': node.get_attr('data_format', 'channels_last'), + } + + # Switch Conv1D layer padding to 'valid' + node.set_attr('padding', 'valid') + node.set_attr('pad_left', 0) + node.set_attr('pad_right', 0) + node.set_attr('in_width', out_width) + + # Insert new ZeroPadding1D node above Conv1D + padding_layer = model.make_node('ZeroPadding1D', 'zp1d_' + node.name, attrs, node.inputs.copy()) + padding_layer.get_output_variable().type.precision = node.get_input_variable().type.precision + model.insert_node(padding_layer) + + return True + + +class InsertZeroPaddingBeforeConv2D(OptimizerPass): + name = 'insert_zero_padding_before_conv2d' + + def match(self, node): + is_match = ( + isinstance(node, (Conv2D, SeparableConv2D)) + and node.get_attr('padding') == 'same' + and node.get_attr('filt_height') != 1 + and node.get_attr('filt_width') != 1 + ) + return is_match + + def transform(self, model, node): + if model.config.get_config_value('IOType') != 'io_stream': + return False + + # Get the padding parameters from Conv2D layer + pad_top = node.get_attr('pad_top') + pad_bottom = 
node.get_attr('pad_bottom') + pad_left = node.get_attr('pad_left') + pad_right = node.get_attr('pad_right') + + # Check if no padding neeeds to be done + if pad_top == pad_bottom == pad_left == pad_right == 0: + return False + + out_height = pad_top + node.get_attr('in_height') + pad_bottom + out_width = pad_left + node.get_attr('in_width') + pad_right + + attrs = { + 'pad_top': pad_top, + 'pad_bottom': pad_bottom, + 'pad_left': pad_left, + 'pad_right': pad_right, + 'in_height': node.get_attr('in_height'), + 'in_width': node.get_attr('in_width'), + 'out_height': out_height, + 'out_width': out_width, + 'n_chan': node.get_attr('n_chan'), + 'data_format': node.get_attr('data_format', 'channels_last'), + } + + # Switch Conv2D layer padding to 'valid' + node.set_attr('padding', 'valid') + node.set_attr('pad_top', 0) + node.set_attr('pad_bottom', 0) + node.set_attr('pad_left', 0) + node.set_attr('pad_right', 0) + node.set_attr('in_height', out_height) + node.set_attr('in_width', out_width) + + # Insert new ZeroPadding2D node above Conv2D + padding_layer = model.make_node('ZeroPadding2D', 'zp2d_' + node.name, attrs, node.inputs.copy()) + padding_layer.get_output_variable().type.precision = node.get_input_variable().type.precision + model.insert_node(padding_layer, before=node) + + return True diff --git a/hls4ml/backends/catapult/passes/conv_stream.py b/hls4ml/backends/catapult/passes/conv_stream.py new file mode 100755 index 0000000000..e0bb853d83 --- /dev/null +++ b/hls4ml/backends/catapult/passes/conv_stream.py @@ -0,0 +1,52 @@ +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class GenerateConvStreamingInstructions(OptimizerPass): + '''Generates the instructions for streaming implementation of CNNs''' + + def match(self, node): + return isinstance(node, (Conv1D, SeparableConv1D, Conv2D, SeparableConv2D)) + + def transform(self, model, node): + node_class = node.__class__.__name__ + 
if '1D' in node_class: + self._generate_1d_instructions(node) + elif '2D' in node_class: + self._generate_2d_instructions(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + + def _generate_1d_instructions(self, node): + if node.model.config.get_config_value('IOType') == 'io_stream': + min_w, instructions = node.model.config.backend.compute_conv1d_instructions( + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + node.get_attr('filt_width'), + node.get_attr('stride_width'), + ) + instructions_str = ','.join(str(i) for i in instructions) + node.set_attr('min_width', min_w) + node.set_attr('instructions', instructions_str) + else: + # these are unused; just put dummy values + node.set_attr('min_width', node.get_attr('in_width')) + node.set_attr('instructions', '0') + + def _generate_2d_instructions(self, node): + if node.model.config.get_config_value('IOType') == 'io_stream': + min_h, min_w, instructions = node.model.config.backend.compute_conv2d_instructions( + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + node.get_input_variable().shape[2], + node.get_attr('filt_height'), + node.get_attr('stride_height'), + ) + instructions_str = ','.join(str(i) for i in instructions) + node.set_attr('min_height', min_h) + node.set_attr('min_width', min_w) + node.set_attr('instructions', instructions_str) + else: + node.set_attr('min_height', node.get_attr('in_height')) + node.set_attr('min_width', node.get_attr('in_width')) + node.set_attr('instructions', '0') diff --git a/hls4ml/backends/catapult/passes/convolution_templates.py b/hls4ml/backends/catapult/passes/convolution_templates.py new file mode 100755 index 0000000000..8014a4ac8e --- /dev/null +++ b/hls4ml/backends/catapult/passes/convolution_templates.py @@ -0,0 +1,508 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from 
hls4ml.model.layers import ( + Conv1D, + Conv2D, + Conv2DBatchnorm, + DepthwiseConv1D, + DepthwiseConv2D, + SeparableConv1D, + SeparableConv2D, +) + +# Shared multiplication template + +conv_mult_config_template = """struct config{index}_mult : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned reuse_factor = {reuse}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +# Conv1D templates + +conv1d_config_template = """struct config{index} : nnet::conv1d_config {{ + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_width = {stride_width}; + static const unsigned dilation = {dilation}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_width = {min_width}; + static const ac_int pixels[min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_width / n_partitions; + template + 
using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index = nnet::{scale_index_type}; +}}; +// really this allocation of pixels array ought to be in a .cpp file +#ifndef INCLUDED_MC_TESTBENCH_H +const ac_int config{index}::pixels[] = {{{instructions}}}; +#endif\n""" + +conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +depthconv1d_function_template = ( + 'nnet::depthwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] + + +class Conv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv1D, DepthwiseConv1D)) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + params = self._default_config_params(node) + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('weight').nzeros + + params['config_t'] = f'config{node.index}_mult' + if node.get_attr('in_width') == node.get_attr('min_width'): + params['scale_index_type'] = 'scale_index_unscaled' + else: + params['scale_index_type'] = 'scale_index_regular' + + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}' + else: + params['fill_fn'] = 'FillConv1DBuffer' + + conv_config = self.template.format(**params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('weight').nzeros + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, 
node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Conv1D, include_header=conv1d_include_list) + self.template = conv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class DepthwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(DepthwiseConv1D, include_header=sepconv1d_include_list) + self.template = depthconv1d_function_template + + +# Conv2D Templates + +conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned filt_height = {filt_height}; + static const unsigned filt_width = {filt_width}; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned multiplier_limit = + DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = 
false; + static const unsigned strategy = nnet::{strategy}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned min_height = {min_height}; + static const unsigned min_width = {min_width}; + static const ac_int pixels[min_height * min_width]; + static const unsigned n_partitions = {n_partitions}; + static const unsigned n_pixels = out_height * out_width / n_partitions; + template + using fill_buffer = nnet::{fill_fn}; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {config_t} mult_config; + template + using scale_index_height = nnet::{scale_index_height_type}; + template + using scale_index_width = nnet::{scale_index_width_type}; +}}; +// really this allocation of pixels array ought to be in a .cpp file +#ifndef INCLUDED_MC_TESTBENCH_H +const ac_int config{index}::pixels[] = {{{instructions}}}; +#endif\n""" + +conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +depthconv2d_function_template = ( + 'nnet::depthwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h'] + + +class Conv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm, DepthwiseConv2D)) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + def format(self, node): + params = self._default_config_params(node) + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('weight').nzeros + + params['config_t'] = f'config{node.index}_mult' + + if node.get_attr('in_height') == node.get_attr('min_height'): + params['scale_index_height_type'] = 'scale_index_unscaled' + else: + params['scale_index_height_type'] = 'scale_index_regular' + + if 
node.get_attr('in_width') == node.get_attr('min_width'): + params['scale_index_width_type'] = 'scale_index_unscaled' + else: + params['scale_index_width_type'] = 'scale_index_regular' + + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}' + else: + params['fill_fn'] = 'FillConv2DBuffer' + + conv_config = self.template.format(**params) + + mult_params = self._default_config_params(node) + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('weight').nzeros + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_config = self.mult_template.format(**mult_params) + + return mult_config + '\n' + conv_config + + +class Conv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Conv2D, Conv2DBatchnorm), include_header=conv2d_include_list) + self.template = conv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +class DepthwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(DepthwiseConv2D, include_header=sepconv2d_include_list) + self.template = depthconv2d_function_template + + +# SeparableConv1D/2D Templates + +sepconv_config_template = """struct config{index} {{ + typedef {depthwise_config} depthwise_config; + typedef {pointwise_config} pointwise_config; +}};\n""" + +sepconv1d_function_template = ( + 'nnet::separable_conv_1d_{data_format}<{input_t}, {dw_output_t}, {output_t}, 
{config}>(' + '{input}, {output}, {d}, {p}, {z}, {b});' +) +sepconv2d_function_template = ( + 'nnet::separable_conv_2d_{data_format}<{input_t}, {dw_output_t}, {output_t}, {config}>(' + '{input}, {output}, {d}, {p}, {z}, {b});' +) + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] + + +class SeparableConv1DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SeparableConv1D) + self.template = sepconv_config_template + self.depthwise_template = conv1d_config_template + self.pointwise_template = conv1d_config_template + self.depthwise_mult_template = conv_mult_config_template + self.pointwise_mult_template = conv_mult_config_template + + def format(self, node): + # Separable master config + params = {} + params['index'] = node.index + params['depthwise_config'] = f'config{node.index}_depthwise' + params['pointwise_config'] = f'config{node.index}_pointwise' + sep_config = self.template.format(**params) + + # Depthwise config + params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv1D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] + params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('depthwise').nzeros + params['index'] = str(node.index) + '_depthwise' + params['weight_t'] = node.get_weights('depthwise').type + params['fill_fn'] = 'FillConv1DBuffer' + + if node.get_attr('unscaled'): + params['scale_index_type'] = 'scale_index_unscaled' + else: + params['scale_index_type'] = 'scale_index_regular' + + params['config_t'] = f'config{node.index}_depthwise_mult' + depthwise_config = self.depthwise_template.format(**params) + + # Depthwise mult config + mult_params = 
self._default_config_params(node) + mult_params['index'] = str(node.index) + '_depthwise' + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_chan') + mult_params['nzeros'] = node.get_weights('depthwise').nzeros + mult_params['weight_t'] = node.get_weights('depthwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision + ) + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) + + # Pointwise config + params = self._default_config_params(node) + if node.get_attr('data_format') == 'channels_last': + params['in_width'] = node.get_output_variable().shape[0] + else: + params['in_width'] = node.get_output_variable().shape[1] + + params['filt_width'] = 1 + params['stride_width'] = 1 + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('pointwise').nzeros + params['index'] = str(node.index) + '_pointwise' + params['weight_t'] = node.get_weights('pointwise').type + params['min_width'] = params['in_width'] + params['instructions'] = '0' + params['fill_fn'] = 'FillConv1DBuffer' + + if node.get_attr('unscaled'): + params['scale_index_type'] = 'scale_index_unscaled' + else: + params['scale_index_type'] = 'scale_index_regular' + + params['config_t'] = f'config{node.index}_pointwise_mult' + pointwise_config = self.pointwise_template.format(**params) + + # Pointwise mult config + mult_params = self._default_config_params(node) + mult_params['index'] = str(node.index) + '_pointwise' + mult_params['n_in'] = node.get_attr('n_chan') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('pointwise').nzeros + mult_params['weight_t'] = node.get_weights('pointwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, 
node.get_weights('pointwise').type.precision + ) + pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) + + return ( + depthwise_mult_config + + '\n' + + depthwise_config + + '\n' + + pointwise_mult_config + + '\n' + + pointwise_config + + '\n' + + sep_config + ) + + +class SeparableConv1DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SeparableConv1D, include_header=sepconv1d_include_list) + self.template = sepconv1d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dw_output_t'] = node.get_attr('dw_output_t').name + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['d'] = node.get_weights('depthwise').name + params['p'] = node.get_weights('pointwise').name + params['b'] = node.get_weights('bias').name + params['z'] = node.get_weights('zero_bias').name + + return self.template.format(**params) + + +class SeparableConv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SeparableConv2D) + self.template = sepconv_config_template + self.depthwise_template = conv2d_config_template + self.pointwise_template = conv2d_config_template + self.depthwise_mult_template = conv_mult_config_template + self.pointwise_mult_template = conv_mult_config_template + + def format(self, node): + # Separable master config + params = {} + params['index'] = node.index + params['depthwise_config'] = f'config{node.index}_depthwise' + params['pointwise_config'] = f'config{node.index}_pointwise' + sep_config = self.template.format(**params) + + # Depthwise config + params = self._default_config_params(node) + # Override bias and bias_t since these are zeros in depthwise step of SepConv2D + params['bias'] = params['zero_bias'] + params['bias_t'] = params['zero_bias_t'] + params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = 
node.get_weights('depthwise').nzeros + params['index'] = str(node.index) + '_depthwise' + params['weight_t'] = node.get_weights('depthwise').type + params['fill_fn'] = 'FillConv2DBuffer' + + if node.get_attr('unscaled_h'): + params['scale_index_height_type'] = 'scale_index_unscaled' + else: + params['scale_index_height_type'] = 'scale_index_regular' + + if node.get_attr('unscaled_w'): + params['scale_index_width_type'] = 'scale_index_unscaled' + else: + params['scale_index_width_type'] = 'scale_index_regular' + + params['config_t'] = f'config{node.index}_depthwise_mult' + depthwise_config = self.depthwise_template.format(**params) + + # Depthwise mult config + mult_params = self._default_config_params(node) + mult_params['index'] = str(node.index) + '_depthwise' + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_height') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_chan') + mult_params['nzeros'] = node.get_weights('depthwise').nzeros + mult_params['weight_t'] = node.get_weights('depthwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision + ) + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) + + # Pointwise config + params = self._default_config_params(node) + if node.get_attr('data_format') == 'channels_last': + params['in_height'] = node.get_output_variable().shape[0] + params['in_width'] = node.get_output_variable().shape[1] + else: + params['in_height'] = node.get_output_variable().shape[1] + params['in_width'] = node.get_output_variable().shape[2] + + params['filt_height'] = params['filt_width'] = 1 + params['stride_height'] = params['stride_width'] = 1 + params['dilation'] = node.get_attr('dilation', 1) + params['nzeros'] = node.get_weights('pointwise').nzeros + params['index'] = str(node.index) + '_pointwise' + params['weight_t'] = node.get_weights('pointwise').type + 
params['min_height'] = params['in_height'] + params['min_width'] = params['in_width'] + params['instructions'] = '0' + params['fill_fn'] = 'FillConv2DBuffer' + + if node.get_attr('unscaled_h'): + params['scale_index_height_type'] = 'scale_index_unscaled' + else: + params['scale_index_height_type'] = 'scale_index_regular' + + if node.get_attr('unscaled_w'): + params['scale_index_width_type'] = 'scale_index_unscaled' + else: + params['scale_index_width_type'] = 'scale_index_regular' + params['config_t'] = f'config{node.index}_pointwise_mult' + pointwise_config = self.pointwise_template.format(**params) + + # Pointwise mult config + mult_params = self._default_config_params(node) + mult_params['index'] = str(node.index) + '_pointwise' + mult_params['n_in'] = node.get_attr('n_chan') + mult_params['n_out'] = node.get_attr('n_filt') + mult_params['nzeros'] = node.get_weights('pointwise').nzeros + mult_params['weight_t'] = node.get_weights('pointwise').type + mult_params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision + ) + pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) + + return ( + depthwise_mult_config + + '\n' + + depthwise_config + + '\n' + + pointwise_mult_config + + '\n' + + pointwise_config + + '\n' + + sep_config + ) + + +class SeparableConv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SeparableConv2D, include_header=sepconv2d_include_list) + self.template = sepconv2d_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dw_output_t'] = node.get_attr('dw_output_t').name + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + params['d'] = node.get_weights('depthwise').name + params['p'] = node.get_weights('pointwise').name + params['b'] = node.get_weights('bias').name + params['z'] = 
node.get_weights('zero_bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/convolution_winograd.py b/hls4ml/backends/catapult/passes/convolution_winograd.py new file mode 100644 index 0000000000..8b25ab41b8 --- /dev/null +++ b/hls4ml/backends/catapult/passes/convolution_winograd.py @@ -0,0 +1,175 @@ +import math + +import numpy as np + +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyWinogradKernelTransformation(OptimizerPass): + ''' + Transforms the weights of a Conv2D kernel to a format suitable for Winograd convolution + For further information, refer to Lavin & Gray, 2015 - Fast Algorithms for Convolutional Neural Networks + ''' + + def match(self, node): + node_matches = isinstance(node, (Conv1D, Conv2D)) + + # This optimizer works only after the Resource Strategy Optimizer, since order of transposition matters + weights_transformed = node.get_attr('_weights_transposed', False) is True + + # User opted for Winograd + implementation_is_winograd = ( + node.get_attr('implementation', 'combination') == 'combination' + or node.get_attr('implementation', 'combination') == 'winograd' + ) + + parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel' + + # Winograd algorithm-specific conditions + if isinstance(node, Conv1D): + # Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_width') > 2 + + winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type + + elif isinstance(node, (Conv2D)): + #
Winograd only applies to specific kernel sizes + # Current implementation only supports fs = 3; easily extendable to other filter sizes + filter_size_matches = node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3 + + # Winograd's minimal filtering algorithm doesn't work with stride != 1 + stride_is_one = node.get_attr('stride_height', 1) == 1 and node.get_attr('stride_width', 1) == 1 + + # HLS Compiler fails to pipeline the entire component if Winograd loop only executes once + loop_itr_gt_one = node.get_attr('out_height') > 2 and node.get_attr('out_width') > 2 + + padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr( + 'pad_left', 0 + ) == node.get_attr('pad_right', 0) + + winograd_conditions = ( + filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type + ) + + else: + winograd_conditions = False + + # Check any previous transformations + already_transformed = node.get_attr('_winograd_transformation_applied', False) is True + + if not winograd_conditions and node.get_attr('implementation', 'combination') == 'winograd': + raise RuntimeError( + 'Not possible to use Winograd algorithm with current architecture.
' + 'Please set implementation to im2col or combination' + ) + + return ( + node_matches + and weights_transformed + and winograd_conditions + and not already_transformed + and implementation_is_winograd + ) + + def transform(self, model, node): + if isinstance(node, Conv1D): + if node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, W, C) + # Therefore, (F, W, C) => (F, C, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 2, 1]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3) => (4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4)) + + # Transformation matrices for 3x1 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(G, weights[filter][channel]) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accommodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 -
node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_width', 4) + + elif isinstance(node, Conv2D): + if node.get_attr('filt_height', 3) == 3 and node.get_attr('filt_width', 3) == 3: + # First, transpose to a format suitable for the Winograd algorithm (F, C, H, W) + # Note, this assumes a format post-resource strategy optimizer, that is (F, H, W, C) + # Therefore, (F, H, W, C) => (F, C, H, W) + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[0, 3, 1, 2]) + + # Temporary copy of data + weights = node.weights['weight'].data + + # Expand weight dimensionality (3x3) => (4x4) + node.weights['weight'].data = np.zeros((weights.shape[0], weights.shape[1], 4, 4)) + + # Transformation matrices for 3x3 kernels + G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]]) + GT = np.array([[1, 0.5, 0.5, 0], [0, 0.5, -0.5, 0], [0, 0.5, 0.5, 1]]) + + # Transformation GfG' + for filter in range(0, weights.data.shape[0]): + for channel in range(0, weights.data.shape[1]): + node.weights['weight'].data[filter][channel] = np.matmul(np.matmul(G, weights[filter][channel]), GT) + node.weights['weight'].data_length = node.weights['weight'].data.size + + # Winograd's minimal filtering algorithm transforms the weight matrix + # This transformation consists of addition and division (by 2&4) of the weight matrix + # Therefore, increase precision (if needed), to accommodate for new weights + # This error is only noticeable for low precisions, such as those used with QKeras + + # Integer precision is only updated if it exceeds the one defined in hls4ml config + maximum_value_rounded = int(math.ceil(np.abs(node.weights['weight'].data).max())) + if
maximum_value_rounded.bit_length() + 1 > node.weights['weight'].type.precision.integer: + node.weights['weight'].type.precision.integer = maximum_value_rounded.bit_length() + 1 + node.weights['weight'].type.precision.width += ( + maximum_value_rounded.bit_length() + 1 - node.weights['weight'].type.precision.integer + ) + + # Fractional precision is increased by 2 bits (division by 4), + # for low-precision (less than 8) fractional weights + if node.weights['weight'].type.precision.fractional < 8: + node.weights['weight'].type.precision.width += 2 + + # Modified kernel size + node.set_attr('impl_filt_height', 4) + node.set_attr('impl_filt_width', 4) + else: + raise Exception(f'Unexpected layer {node.class_name} with Winograd kernel optimizer') + + node.set_attr('_winograd_transformation_applied', True) + + return False diff --git a/hls4ml/backends/catapult/passes/core_templates.py b/hls4ml/backends/catapult/passes/core_templates.py new file mode 100755 index 0000000000..2088923428 --- /dev/null +++ b/hls4ml/backends/catapult/passes/core_templates.py @@ -0,0 +1,216 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax + +# Dense templates + +dense_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef 
{weight_t.name} weight_t; + typedef {index_t.name} index_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::dense<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? 
n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +hard_activ_config_template = """struct {type}_config{index} {{ + static const unsigned n_in = {n_in}; + static const {slope_t.name} 
slope; + static const {shift_t.name} shift; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; +}}; +// really this allocation of pixels array ought to be in a .cpp file +#ifndef INCLUDED_MC_TESTBENCH_H +const {slope_t.name} {type}_config{index}::slope = {slope}; +const {shift_t.name} {type}_config{index}::shift = {shift}; +#endif\n""" + +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const unsigned axis = {axis}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {exp_table_t.name} exp_table_t; + typedef {inv_table_t.name} inv_table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' +param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Activation, ParametrizedActivation, PReLU)) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class HardActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(HardActivation) + self.template = hard_activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class SoftmaxConfigTemplate(ActivationConfigTemplate): + def __init__(self): + 
super(ActivationConfigTemplate, self).__init__(Softmax) # Skip ActivationConfigTemplate's __init__ + self.template = softmax_config_template + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class ParametrizedActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ParametrizedActivation, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node._get_act_function_name() + params['param'] = node.get_attr('activ_param', 1.0) + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) + + +class PReLUFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(PReLU, include_header=activ_include_list) + self.template = param_activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = node.get_attr('activation').lower() + params['param'] = node.get_weights('alpha').name + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/fifo_depth_optimization.py b/hls4ml/backends/catapult/passes/fifo_depth_optimization.py new file mode 100755 index 0000000000..4d92e98de1 --- /dev/null +++ b/hls4ml/backends/catapult/passes/fifo_depth_optimization.py @@ -0,0 +1,104 @@ +import json + +from pyDigitalWaveTools.vcd.parser import VcdParser + 
+from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +def populate_values(values, name, data, depth): + def get_values(x): + return int(x[1][1:], 2) + + values.append({'name': name, 'data': [], 'max': 0, 'depth': 0}) + values[-1]['data'] = [get_values(x) for x in data] + values[-1]['max'] = max(values[-1]['data']) + values[-1]['depth'] = int(depth[0][1][1:], 2) + return values + + +def set_big_fifos(vars_to_profile, profiling_fifo_depth): + for v in vars_to_profile.values(): + if v.pragma: + v.pragma = (v.pragma[0], profiling_fifo_depth) + + +def get_vcd_data(model): + model.write() + model.build(reset=False, csim=True, synth=True, cosim=True, validation=False, export=False, vsynth=False, fifo_opt=True) + + with open( + model.config.get_output_dir() + + '/' + + model.config.get_project_name() + + '_prj' + + '/solution1/sim/verilog/fifo_opt.vcd' + ) as vcd_file: + vcd = VcdParser() + vcd.parse(vcd_file) + data = vcd.scope.toJson() + return data + + +def generate_max_depth_file(model, maxs): + with open(model.config.get_output_dir() + '/max_depth.json', 'w') as f: + json.dump(maxs, f, indent=4) + + +def set_fifo_depth(model, maxs): + for v in model.output_vars.values(): + if v.pragma: + filtered_max = [x['max'] for x in maxs if v.name in x['name']] + if len(filtered_max) == 0: + continue + if len(filtered_max) > 1: + print('WARNING! 
Check names of FIFOs') + v.pragma = (v.pragma[0], filtered_max[0] + 1) + + +class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): + def __init__(self): + self.values = [] + + def transform(self, model): + # use `large_fifo_depth = 0` to keep the default fifo depth + profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) + + # check axi-stream or io-stream, if not one the 2 exit + if not (model.config.get_config_value('IOType') == 'io_stream'): + raise RuntimeError('To use this optimization you have to set `IOType` field to `io_stream` in the HLS config') + + # initialize all the fifos to `profiling_fifo_depth` so that they will be automatically implemented in BRAMs + # and so they will be profiled + if profiling_fifo_depth: + vars_to_profile = { + k: v + for k, v in model.output_vars.items() + if v != model.get_output_variables()[0] and v != model.get_input_variables()[0] + } + + set_big_fifos(vars_to_profile, profiling_fifo_depth) + + data = get_vcd_data(model) + + if len(data['children']) == 0: + print( + "FIFO depth optimization found no FIFOs implemented using BRAMs in the design, no optimization is possible." 
+ ) + print("Consider increasing profiling_fifo_depth.") + return False + + n_elem = len(data['children'][0]['children'][0]['children']) + for i in range(n_elem): + name = data['children'][0]['children'][0]['children'][i]['name'] + data_p = data['children'][0]['children'][0]['children'][i]['children'][0]['data'] + depth = data['children'][0]['children'][0]['children'][i]['children'][1]['data'] + populate_values(self.values, name, data_p, depth) + + maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] + + generate_max_depth_file(model, maxs) + + set_fifo_depth(model, maxs) + + print('[hls4ml] - FIFO optimization completed') + return False diff --git a/hls4ml/backends/catapult/passes/garnet_templates.py b/hls4ml/backends/catapult/passes/garnet_templates.py new file mode 100755 index 0000000000..f73f627683 --- /dev/null +++ b/hls4ml/backends/catapult/passes/garnet_templates.py @@ -0,0 +1,249 @@ +import numpy as np + +from hls4ml.backends.fpga.fpga_types import ACTypeConverter +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GarNet, GarNetStack +from hls4ml.model.types import FixedPrecisionType + +# GarNet templates + +garnet_common_config_template = """ + static const unsigned n_vertices = {n_vertices}; + static const unsigned n_vertices_width = {n_vertices_width}; + static const unsigned n_in_features = {n_in_features}; + static const unsigned distance_width = {distance_width}; + static const unsigned output_collapse = {collapse_type}; + static const bool mean_by_nvert = {mean_by_nvert}; + + typedef {norm_t} norm_t; + typedef ac_fixed<{distance_width}, {distance_nint}, true, AC_TRN, AC_SAT> distance_t; + typedef {edge_weight_t} edge_weight_t; + typedef {edge_weight_aggr_t} edge_weight_aggr_t; + typedef {aggr_t} aggr_t; + typedef {output_t} output_t; + + static const unsigned reuse_factor = {reuse}; + static const unsigned log2_reuse_factor = {log2_reuse}; +""" + 
+garnet_config_template = """struct config{index} : nnet::garnet_config {{""" +garnet_config_template += garnet_common_config_template +garnet_config_template += """ + static const unsigned n_propagate = {n_propagate}; + static const unsigned n_aggregators = {n_aggregators}; + static const unsigned n_out_features = {n_out_features}; + + typedef {input_transform_weights_t} input_transform_weights_t; + typedef {input_transform_biases_t} input_transform_biases_t; + typedef {aggregator_distance_weights_t} aggregator_distance_weights_t; + typedef {aggregator_distance_biases_t} aggregator_distance_biases_t; + typedef {output_transform_weights_t} output_transform_weights_t; + typedef {output_transform_biases_t} output_transform_biases_t; + + static const input_transform_weights_t (&input_transform_weights)[{input_transform_weights_size}]; + static const input_transform_biases_t (&input_transform_biases)[{input_transform_biases_size}]; + static const aggregator_distance_weights_t (&aggregator_distance_weights)[{aggregator_distance_weights_size}]; + static const aggregator_distance_biases_t (&aggregator_distance_biases)[{aggregator_distance_biases_size}]; + static const output_transform_weights_t (&output_transform_weights)[{output_transform_weights_size}]; + static const output_transform_biases_t (&output_transform_biases)[{output_transform_biases_size}]; + + typedef config{index} base_t; +}}; + +const config{index}::input_transform_weights_t (&config{index}::input_transform_weights)[{input_transform_weights_size}] = {input_transform_weights}; +const config{index}::input_transform_biases_t (&config{index}::input_transform_biases)[{input_transform_biases_size}] = {input_transform_biases}; +const config{index}::aggregator_distance_weights_t (&config{index}::aggregator_distance_weights)[{aggregator_distance_weights_size}] = {aggregator_distance_weights}; +const config{index}::aggregator_distance_biases_t 
(&config{index}::aggregator_distance_biases)[{aggregator_distance_biases_size}] = {aggregator_distance_biases}; +const config{index}::output_transform_weights_t (&config{index}::output_transform_weights)[{output_transform_weights_size}] = {output_transform_weights}; +const config{index}::output_transform_biases_t (&config{index}::output_transform_biases)[{output_transform_biases_size}] = {output_transform_biases}; +""" # noqa: E501 + +garnet_function_template = ( + 'nnet::garnet{impl}<{input_t}, {integer_input_t}, {output_t}, {config}>({input}, {nvtx}, {output});' +) + +garnet_include_list = ['nnet_utils/nnet_garnet.h'] + + +class GarNetConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(GarNet) + self.template = (garnet_config_template,) + + def get_transforms_config(self, node, params): + params['n_in_features'] = node.attributes['n_in_features'] + params['n_propagate'] = node.attributes['n_propagate'] + params['n_aggregators'] = node.get_weights('aggregator_distance_biases').shape[0] + params['n_out_features'] = node.get_weights('output_transform_biases').shape[0] + + for wname, weights in node.weights.items(): + params[wname] = weights.name + params[f'{wname}_t'] = weights.type.name + params[f'{wname}_size'] = weights.data_length + + def format(self, node): + params = self._default_config_params(node) + + params['n_vertices'] = node.attributes['n_vertices'] + params['n_vertices_width'] = int(np.log2(params['n_vertices'])) + params['distance_width'] = 12 + params['distance_nint'] = min(4, params['distance_width'] - 6) # this is tuned + params['log2_reuse'] = int(np.log2(params['reuse'])) + + # Define default precisions for various internal arrays (can be overridden from the config file) + # We always give 10 digits for the subintegral part + fwidth = 10 + # Integral precision for aggr_t depends on how large the temporary sum for weighed feature mean will be + aggr_intw = max(params['log2_reuse'], params['n_vertices_width'] - 
params['log2_reuse']) + 3 # safety factor 2**3 + aggr_w = aggr_intw + fwidth + # edge_weight_aggr_t does not need the safety factor + ew_aggr_intw = aggr_intw - 3 + ew_aggr_w = ew_aggr_intw + fwidth + # Integral precision for norm is fixed to 4 + norm_intw = 4 + norm_w = norm_intw + fwidth + + vspecs = [ + ('edge_weight', FixedPrecisionType(10, 0, signed=False)), + ('edge_weight_aggr', FixedPrecisionType(ew_aggr_w, ew_aggr_intw, signed=False)), + ('aggr', FixedPrecisionType(aggr_w, aggr_intw)), + ('norm', FixedPrecisionType(norm_w, norm_intw, signed=False)), + ] + precision_converter = ACTypeConverter() + for vname, default_precision in vspecs: + params[f'{vname}_t'], type_name = node.model.config.get_precision(node, var=vname) + if type_name.endswith('default_t'): + params[f'{vname}_t'] = precision_converter.convert(default_precision).definition_cpp() + else: + params[f'{vname}_t'] = precision_converter.convert(params[f'{vname}_t']).definition_cpp() + params['output_t'] = node.get_output_variable().type.name + + if node.attributes['collapse'] in ['mean', 'max']: + params['collapse_type'] = 'collapse_{}'.format(node.attributes['collapse']) + else: + params['collapse_type'] = 'no_collapse' + + params['mean_by_nvert'] = str(node.attributes['mean_by_nvert']).lower() + + self.get_transforms_config(node, params) + + return self.template[0].format(**params) + + +class GarNetFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(GarNet, include_header=garnet_include_list) + self.template = garnet_function_template + + def format(self, node): + params = self._default_function_params(node) + + data = node.get_input_variable(node.inputs[0]) + integer_input = node.get_input_variable(node.inputs[1]) + params['input_t'] = data.type.name + params['input'] = data.name + + params['integer_input_t'] = integer_input.type.name + params['nvtx'] = integer_input.name + + if node.ref_impl: + params['impl'] = '_ref' + else: + params['impl'] = '' + + return 
self.template.format(**params) + + +# GarNetStack Templates + +garnet_stack_base_config_template = """struct config{index}_base : nnet::garnet_config {{""" +garnet_stack_base_config_template += garnet_common_config_template +garnet_stack_base_config_template += """ + static const bool is_stack = true; + + typedef config{index}_base base_t; +}}; + +struct config{index} : config{index}_base {{ + static const unsigned n_sublayers = {n_sublayers}; + + template<int il> + struct sublayer_t : config{index}_base {{}}; +}}; + +{sublayer_configs} +""" + +garnet_stack_sublayer_config_template = """template<> +struct config{index}::sublayer_t<{il}> : config{index}_base {{ + static const unsigned n_in_features = {n_in_features}; + static const unsigned n_propagate = {n_propagate}; + static const unsigned n_aggregators = {n_aggregators}; + static const unsigned n_out_features = {n_out_features}; + + typedef {input_transform_weights_t} input_transform_weights_t; + typedef {input_transform_biases_t} input_transform_biases_t; + typedef {aggregator_distance_weights_t} aggregator_distance_weights_t; + typedef {aggregator_distance_biases_t} aggregator_distance_biases_t; + typedef {output_transform_biases_t} output_transform_biases_t; + + static const input_transform_weights_t (&input_transform_weights)[{input_transform_weights_size}]; + static const input_transform_biases_t (&input_transform_biases)[{input_transform_biases_size}]; + static const aggregator_distance_weights_t (&aggregator_distance_weights)[{aggregator_distance_weights_size}]; + static const aggregator_distance_biases_t (&aggregator_distance_biases)[{aggregator_distance_biases_size}]; + static const output_transform_biases_t (&output_transform_biases)[{output_transform_biases_size}]; + + typedef config{index}::sublayer_t<{next}> next_layer_t; +}}; + +const config{index}::sublayer_t<{il}>::input_transform_weights_t (&config{index}::sublayer_t<{il}>::input_transform_weights)[{input_transform_weights_size}] =
{input_transform_weights}; +const config{index}::sublayer_t<{il}>::input_transform_biases_t (&config{index}::sublayer_t<{il}>::input_transform_biases)[{input_transform_biases_size}] = {input_transform_biases}; +const config{index}::sublayer_t<{il}>::aggregator_distance_weights_t (&config{index}::sublayer_t<{il}>::aggregator_distance_weights)[{aggregator_distance_weights_size}] = {aggregator_distance_weights}; +const config{index}::sublayer_t<{il}>::aggregator_distance_biases_t (&config{index}::sublayer_t<{il}>::aggregator_distance_biases)[{aggregator_distance_biases_size}] = {aggregator_distance_biases}; +const config{index}::sublayer_t<{il}>::output_transform_biases_t (&config{index}::sublayer_t<{il}>::output_transform_biases)[{output_transform_biases_size}] = {output_transform_biases}; +""" # noqa: E501 + +garnet_stack_config_template = (garnet_stack_base_config_template, garnet_stack_sublayer_config_template) +garnet_stack_function_template = ( + 'nnet::garnet_stack<{input_t}, {integer_input_t}, {output_t}, {config}>({input}, {nvtx}, {output});' +) + + +class GarNetStackConfigTemplate(GarNetConfigTemplate): + def __init__(self): + super(GarNetConfigTemplate, self).__init__(GarNetStack) + self.template = garnet_stack_config_template + + def get_transforms_config(self, node, params): + _, sublayer_template = self.template + + params['n_sublayers'] = node.attributes['n_sublayers'] + params['n_in_features'] = node.attributes['n_in_features'][0] + params['n_out_features'] = node.attributes['n_out_features'][-1] + + sublayer_configs = [] + for il in range(node.attributes['n_sublayers'] - 1, -1, -1): + sub_params = {'index': node.index, 'il': il} + + for p in ['n_in_features', 'n_propagate', 'n_aggregators', 'n_out_features']: + sub_params[p] = node.attributes[p][il] + + for wname, weights in node._sublayer_weights[il].items(): + sub_params[wname] = weights.name + sub_params[f'{wname}_t'] = weights.type.name + sub_params[f'{wname}_size'] = weights.data_length + + if il 
!= node.attributes['n_sublayers'] - 1: + sub_params['next'] = il + 1 + else: + sub_params['next'] = 0 + + sublayer_configs.append(sublayer_template.format(**sub_params)) + + params['sublayer_configs'] = '\n'.join(sublayer_configs) + + +class GarNetStackFunctionTemplate(GarNetFunctionTemplate): + def __init__(self): + super(GarNetFunctionTemplate, self).__init__(GarNetStack, include_header=garnet_include_list) + self.template = garnet_stack_function_template diff --git a/hls4ml/backends/catapult/passes/merge_templates.py b/hls4ml/backends/catapult/passes/merge_templates.py new file mode 100755 index 0000000000..ff6928679c --- /dev/null +++ b/hls4ml/backends/catapult/passes/merge_templates.py @@ -0,0 +1,106 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Dot, Merge + +# Merge templates + +merge_config_template = """struct config{index} : nnet::merge_config {{ + static const unsigned n_elem = {n_elem}; +}};\n""" + +merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' + +merge_include_list = ['nnet_utils/nnet_merge.h', 'nnet_utils/nnet_merge_stream.h'] + + +class MergeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Merge) + self.template = merge_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_elem'] = node.get_input_variable(node.inputs[0]).size_cpp() + + return self.template.format(**params) + + +class MergeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Merge, Concatenate, Dot), include_header=merge_include_list) + self.template = merge_function_template + + def format(self, node): + params = {} + params['merge'] = node.get_attr('op').lower() + params['config'] = f'config{node.index}' + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + 
params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name + params['output_t'] = node.get_output_variable().type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + params['output'] = node.get_output_variable().name + + return self.template.format(**params) + + +# Dot templates + +dot_config_template = """struct config{index} : nnet::dot_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); + typedef {accum_t.name} accum_t; + template<class x_T, class y_T> + using product = nnet::product::{product_type}<x_T, y_T>; +}};\n""" + + +class DotConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dot) + self.template = dot_config_template + + def format(self, node): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + params = self._default_config_params(node) + params['n_out'] = 1 + params['n_in'] = inp1.shape[0] + params['product_type'] = get_backend('catapult').product_type(inp1.type.precision, inp2.type.precision) + + return self.template.format(**params) + + +# Concatenate templates + +concat_config_template = """struct config{index} : nnet::concat_config {{ + static const unsigned n_elem1_0 = {n_elem1_0}; + static const unsigned n_elem1_1 = {n_elem1_1}; + static const unsigned n_elem1_2 = {n_elem1_2}; + static const unsigned n_elem2_0 = {n_elem2_0}; + static const unsigned n_elem2_1 = {n_elem2_1}; + static const unsigned n_elem2_2 = {n_elem2_2}; + + static const int axis = {axis}; +}};\n""" + + +class ConcatenateConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Concatenate) + self.template = concat_config_template + + def format(self, node): + params = self._default_config_params(node) + for i in range(3): + params.setdefault(f'n_elem1_{i}', 0)
+ params.setdefault(f'n_elem2_{i}', 0) + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + for i, (s1, s2) in enumerate(zip(inp1.shape, inp2.shape)): + params[f'n_elem1_{i}'] = s1 + params[f'n_elem2_{i}'] = s2 + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py new file mode 100755 index 0000000000..2dd982b5d4 --- /dev/null +++ b/hls4ml/backends/catapult/passes/pointwise.py @@ -0,0 +1,92 @@ +from copy import copy + +import numpy as np + +from hls4ml.backends.catapult.passes.convolution_templates import ( + Conv1DConfigTemplate, + Conv1DFunctionTemplate, + Conv2DConfigTemplate, + Conv2DFunctionTemplate, + conv1d_config_template, + conv2d_config_template, + conv_mult_config_template, +) +from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D +from hls4ml.model.layers import register_layer +from hls4ml.model.optimizer import OptimizerPass + +pointwise_conv1d_function_template = ( + 'nnet::pointwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +pointwise_conv2d_function_template = ( + 'nnet::pointwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) + +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] + + +class PointwiseConv1DConfigTemplate(Conv1DConfigTemplate): + def __init__(self): + super(Conv1DConfigTemplate, self).__init__(PointwiseConv1D) + self.template = conv1d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(PointwiseConv1D, include_header=sepconv1d_include_list) + self.template = pointwise_conv1d_function_template + + +class 
PointwiseConv2DConfigTemplate(Conv2DConfigTemplate): + def __init__(self): + super(Conv2DConfigTemplate, self).__init__(PointwiseConv2D) + self.template = conv2d_config_template + self.mult_template = conv_mult_config_template + + +class PointwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(PointwiseConv2D, include_header=sepconv2d_include_list) + self.template = pointwise_conv2d_function_template + + +def register_pointwise(backend): + # Register the layer types to the layer map + register_layer('PointwiseConv1D', PointwiseConv1D) + register_layer('PointwiseConv2D', PointwiseConv2D) + + # Register the optimization passes + backend.register_pass('optimize_pointwise_conv', OptimizePointwiseConv) + + # Register template passes + backend.register_template(PointwiseConv1DConfigTemplate) + backend.register_template(PointwiseConv1DFunctionTemplate) + backend.register_template(PointwiseConv2DConfigTemplate) + backend.register_template(PointwiseConv2DFunctionTemplate) + + +class OptimizePointwiseConv(OptimizerPass): + def match(self, node): + return ( + node.class_name in ('Conv1D', 'Conv2D') + and node.get_attr('filt_height', 1) == 1 + and node.get_attr('filt_width') == 1 + ) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy()) + if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D + expand_axis = tuple(range(int(dim[0]))) + pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=expand_axis) + pw_node.weights['bias'].data = node.weights['bias'].data + # Set strategy to ensure lowercase string is passed to the template + if model.config.is_resource_strategy(pw_node): + pw_node.set_attr('strategy', 'resource') + else: + pw_node.set_attr('strategy', 'latency') + 
model.replace_node(node, pw_node) + + return True diff --git a/hls4ml/backends/catapult/passes/pooling_templates.py b/hls4ml/backends/catapult/passes/pooling_templates.py new file mode 100755 index 0000000000..77205a5df7 --- /dev/null +++ b/hls4ml/backends/catapult/passes/pooling_templates.py @@ -0,0 +1,109 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GlobalPooling1D, GlobalPooling2D, Pooling1D, Pooling2D + +# Pooling templates + +pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_filt = {n_filt}; + static const unsigned pool_width = {pool_width}; + + static const unsigned filt_width = pool_width; + static const unsigned n_chan = n_filt; + + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + static const unsigned stride_width = {stride_width}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const unsigned stride_height = {stride_height}; + static const unsigned stride_width = {stride_width}; + static const unsigned pool_height = {pool_height}; + static const unsigned pool_width = {pool_width}; + + static const unsigned filt_height = pool_height; + static const unsigned filt_width = pool_width; + static const unsigned n_chan = n_filt; + + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const 
unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; + static const bool count_pad = {count_pad}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_filt = {n_filt}; + static const nnet::Pool_Op pool_op = nnet::{pool_op}; + static const unsigned reuse_factor = {reuse}; + typedef {accum_t.name} accum_t; +}};\n""" + +pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +global_pooling1d_function_template = ( + 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) +global_pooling2d_function_template = ( + 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +) + +pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h'] + + +class PoolingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D)) + self.templates = { + 'Pooling1D': pooling1d_config_template, + 
'Pooling2D': pooling2d_config_template, + 'GlobalPooling1D': global_pooling1d_config_template, + 'GlobalPooling2D': global_pooling2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class PoolingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Pooling1D, Pooling2D, GlobalPooling1D, GlobalPooling2D), include_header=pooling_include_list) + self.templates = { + 'Pooling1D': pooling1d_function_template, + 'Pooling2D': pooling2d_function_template, + 'GlobalPooling1D': global_pooling1d_function_template, + 'GlobalPooling2D': global_pooling2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + + return self.templates[node.class_name].format(**params) diff --git a/hls4ml/backends/catapult/passes/quantization_templates.py b/hls4ml/backends/catapult/passes/quantization_templates.py new file mode 100755 index 0000000000..7086b187f9 --- /dev/null +++ b/hls4ml/backends/catapult/passes/quantization_templates.py @@ -0,0 +1,36 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.catapult.passes.core_templates import ( + batchnorm_config_template, + batchnorm_function_template, + batchnorm_include_list, +) +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.optimizer.passes.qkeras import ApplyAlpha + + +class ApplyAlphaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(ApplyAlpha) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return 
self.template.format(**params) + + +class ApplyAlphaFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(ApplyAlpha, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/recurrent_templates.py b/hls4ml/backends/catapult/passes/recurrent_templates.py new file mode 100755 index 0000000000..4079f25721 --- /dev/null +++ b/hls4ml/backends/catapult/passes/recurrent_templates.py @@ -0,0 +1,175 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import GRU, LSTM + +# recurrent multiplication template + +recr_mult_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef {index_t.name} index_t; + template<class x_T, class y_T> + using product = nnet::product::{product_type}<x_T, y_T>; +}};\n""" + +# activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + 
+recr_activ_config_template = """struct {type}_config{index}_recr : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +# LSTM + GRU templates + +recr_config_template = """struct config{index} : nnet::{recr_type}_config {{ + typedef {accum_t.name} accum_t; + typedef {weight_t.name} weight_t; // Matrix + typedef {bias_t.name} bias_t; // Vector + typedef {config_mult_t1} mult_config1; + typedef {config_mult_t2} mult_config2; + typedef {recr_act_t} ACT_CONFIG_{RECR_TYPE}; + template<class x_T, class y_T, class config_T> + using activation_recr = nnet::activation::{recurrent_activation}<x_T, y_T, config_T>; + typedef {act_t} ACT_CONFIG_T; + template<class x_T, class y_T, class config_T> + using activation = nnet::activation::{activation}<x_T, y_T, config_T>; + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned n_state = {n_state}; + static const unsigned n_sequence = {n_sequence}; + static const unsigned n_sequence_out = {n_sequence_out}; + static const unsigned io_type = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; + static const bool use_static = {static}; +}};\n""" + +recr_function_template = 'nnet::{recr_type}_stack<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {wr}, {b}, {br});' + +recr_include_list = ['nnet_utils/nnet_recurrent.h'] + + +class RecurrentConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((LSTM, GRU)) + self.template = recr_config_template + self.act_template = activ_config_template + self.recr_act_template = recr_activ_config_template + self.mult1_template = recr_mult_config_template + self.mult2_template = recr_mult_config_template + + def format(self, node): + params = self._default_config_params(node) + + params['n_in'] = node.get_input_variable().dim_names[1] + params['n_sequence'] = 
node.get_input_variable().dim_names[0] + if node.get_attr('return_sequences'): + params['n_sequence_out'] = node.get_output_variable().dim_names[0] + params['n_state'] = node.get_output_variable().dim_names[1] + params['n_out'] = node.get_output_variable().dim_names[1] + else: + params['n_sequence_out'] = 1 + params['n_state'] = node.get_output_variable().dim_names[0] + params['n_out'] = node.get_output_variable().dim_names[0] + params['config_mult_t1'] = f'config{node.index}_1' + params['config_mult_t2'] = f'config{node.index}_2' + params['recr_act_t'] = '{}_config{}_recr'.format(node.get_attr('recurrent_activation'), node.index) + params['act_t'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + params['strategy'] = node.get_attr('strategy') + params['static'] = 'true' if node.attributes['static'] else 'false' + params['recr_type'] = node.class_name.lower() + params['RECR_TYPE'] = node.class_name + + if node.class_name == 'LSTM': + n_recr_mult = 4 + else: # GRU + n_recr_mult = 3 + + recr_config = self.template.format(**params) + + act_params = self._default_config_params(node) + recr_act_params = self._default_config_params(node) + + act_params['type'] = node.get_attr('activation') + recr_act_params['type'] = node.get_attr('recurrent_activation') + if node.get_attr('return_sequences'): + act_params['n_in'] = node.get_output_variable().dim_names[1] + recr_act_params['n_in'] = node.get_output_variable().dim_names[1] + ' * %i' % (n_recr_mult - 1) + else: + act_params['n_in'] = node.get_output_variable().dim_names[0] + recr_act_params['n_in'] = node.get_output_variable().dim_names[0] + ' * %i' % (n_recr_mult - 1) + + act_config = self.act_template.format(**act_params) + recr_act_config = self.recr_act_template.format(**recr_act_params) + + mult_params1 = self._default_config_params(node) + mult_params2 = self._default_config_params(node) + + mult_params1['n_in'] = node.get_input_variable().dim_names[1] + if node.get_attr('return_sequences'): + 
mult_params1['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + else: + mult_params1['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params1['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + mult_params1['reuse'] = params['reuse'] + mult_params1['index'] = str(node.index) + '_1' + mult_params1['nzeros'] = node.get_weights('weight').nzeros + mult_params1['nonzeros'] = node.get_weights('weight').nonzeros + if node.get_attr('return_sequences'): + mult_params2['n_in'] = node.get_output_variable().dim_names[1] + mult_params2['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + else: + mult_params2['n_in'] = node.get_output_variable().dim_names[0] + mult_params2['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params2['product_type'] = get_backend('catapult').product_type( + node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision + ) + mult_params2['reuse'] = node.attributes['recurrent_reuse_factor'] + mult_params2['index'] = str(node.index) + '_2' + mult_params2['nzeros'] = node.get_weights('recurrent_weight').nzeros + mult_params2['nonzeros'] = node.get_weights('recurrent_weight').nonzeros + + mult_config1 = self.mult1_template.format(**mult_params1) + mult_config2 = self.mult2_template.format(**mult_params2) + + return mult_config1 + '\n' + mult_config2 + '\n' + recr_act_config + '\n' + act_config + '\n' + recr_config + + +class RecurrentFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((LSTM, GRU), include_header=recr_include_list) + self.template = recr_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['wr'] = 
node.get_weights('recurrent_weight').name + params['br'] = node.get_weights('recurrent_bias').name + params['activation'] = node.get_attr('activation') + params['recurrent_activation'] = node.get_attr('recurrent_activation') + params['recr_type'] = node.class_name.lower() + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/reshaping_templates.py b/hls4ml/backends/catapult/passes/reshaping_templates.py new file mode 100755 index 0000000000..ec6705eb29 --- /dev/null +++ b/hls4ml/backends/catapult/passes/reshaping_templates.py @@ -0,0 +1,132 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Resize, Transpose, ZeroPadding1D, ZeroPadding2D + +# ZeroPadding templates + +zeropad1d_config_template = """struct config{index} : nnet::padding1d_config {{ + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned out_width = {out_width}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad2d_config_template = """struct config{index} : nnet::padding2d_config {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; + static const unsigned pad_top = {pad_top}; + static const unsigned pad_bottom = {pad_bottom}; + static const unsigned pad_left = {pad_left}; + static const unsigned pad_right = {pad_right}; +}};\n""" + +zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' +zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});' + +padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h'] + + +class 
ZeroPaddingConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D)) + self.templates = { + 'ZeroPadding1D': zeropad1d_config_template, + 'ZeroPadding2D': zeropad2d_config_template, + } + + def format(self, node): + params = self._default_config_params(node) + return self.templates[node.class_name].format(**params) + + +class ZeroPaddingFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((ZeroPadding1D, ZeroPadding2D), include_header=padding_include_list) + self.templates = { + 'ZeroPadding1D': zeropad1d_function_template, + 'ZeroPadding2D': zeropad2d_function_template, + } + + def format(self, node): + params = self._default_function_params(node) + params['data_format'] = 'cf' if node.get_attr('data_format') == 'channels_first' else 'cl' + + return self.templates[node.class_name].format(**params) + + +# Resize templates + +resize_config_template = """struct config{index} : nnet::resize_config {{ + static const unsigned height = {in_height}; + static const unsigned width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned new_height = {out_height}; + static const unsigned new_width = {out_width}; +}};\n""" + +resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {config}>({input}, {output});' + +resize_include_list = ['nnet_utils/nnet_image.h', 'nnet_utils/nnet_image_stream.h'] + + +class ResizeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Resize) + self.template = resize_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class ResizeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Resize, include_header=resize_include_list) + self.template = resize_function_template + + def format(self, node): + params = self._default_function_params(node) + params['algorithm'] = node.get_attr('algorithm') + + return 
self.template.format(**params) + + +# Transpose templates + +transpose_config_template = """struct config{index} : nnet::transpose_config {{ + static const unsigned depth = {depth}; + static const unsigned height = {height}; + static const unsigned width = {width}; + static constexpr unsigned perm[3] = {{{perm_str}}}; +}};\n""" + +transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});' + +transpose_include_list = ['nnet_utils/nnet_array.h', 'nnet_utils/nnet_stream.h'] + + +class TransposeConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Transpose) + self.template = transpose_config_template + + def format(self, node): + params = self._default_config_params(node) + + return self.template.format(**params) + + +class TransposeFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Transpose, include_header=transpose_include_list) + self.template = transpose_function_template + + def format(self, node): + params = self._default_function_params(node) + params['dim'] = node.get_attr('dim') + + return self.template.format(**params) diff --git a/hls4ml/backends/catapult/passes/resource_strategy.py b/hls4ml/backends/catapult/passes/resource_strategy.py new file mode 100755 index 0000000000..63e6e0b4db --- /dev/null +++ b/hls4ml/backends/catapult/passes/resource_strategy.py @@ -0,0 +1,48 @@ +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class ApplyResourceStrategy(OptimizerPass): + '''Transposes the weights to use the dense_resource matrix multiply routine''' + + def match(self, node): + node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) + is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' + already_transformed = node.get_attr('_weights_transposed', False) is True + + 
return node_matches and is_resource_strategy and not already_transformed + + def transform(self, model, node): + if isinstance(node, Dense): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + elif isinstance(node, Conv1D): + node.weights['weight'].data = np.transpose(node.weights['weight'].data, axes=[2, 0, 1]) # (W,C,F) => (F,W,C) + elif isinstance(node, SeparableConv1D): + node.weights['depthwise'].data = np.transpose( + node.weights['depthwise'].data, axes=[2, 0, 1] + ) # (W,C,F) => (F,W,C) + node.weights['pointwise'].data = np.transpose( + node.weights['pointwise'].data, axes=[2, 0, 1] + ) # (W,C,F) => (F,W,C) + elif isinstance(node, Conv2D): + node.weights['weight'].data = np.transpose( + node.weights['weight'].data, axes=[3, 0, 1, 2] + ) # (H,W,C,F) => (F,H,W,C) + elif isinstance(node, SeparableConv2D): + node.weights['depthwise'].data = np.transpose( + node.weights['depthwise'].data, axes=[3, 0, 1, 2] + ) # (H,W,C,F) => (F,H,W,C) + node.weights['pointwise'].data = np.transpose( + node.weights['pointwise'].data, axes=[3, 0, 1, 2] + ) # (H,W,C,F) => (F,H,W,C) + elif isinstance(node, (LSTM, GRU)): + node.weights['weight'].data = np.transpose(node.weights['weight'].data) + node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + else: + raise Exception(f'Unexpected layer {node.class_name} with resource strategy') + + node.set_attr('_weights_transposed', True) + + return False diff --git a/hls4ml/backends/catapult/passes/transform_types.py b/hls4ml/backends/catapult/passes/transform_types.py new file mode 100755 index 0000000000..4ef3548cb6 --- /dev/null +++ b/hls4ml/backends/catapult/passes/transform_types.py @@ -0,0 +1,52 @@ +from hls4ml.backends.fpga.fpga_types import ( + ACTypeConverter, + CatapultArrayVariableConverter, + CatapultInplaceArrayVariableConverter, + CatapultInplaceStreamVariableConverter, + CatapultStreamVariableConverter, + HLSTypeConverter, + StaticWeightVariableConverter, +) +from 
hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = HLSTypeConverter(precision_converter=ACTypeConverter()) + self.array_var_converter = CatapultArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = CatapultInplaceArrayVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = CatapultStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = CatapultInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var) + else: + new_var = self.stream_var_converter.convert(var) + elif io_type == 'io_serial': + new_var = self.array_var_converter.convert(var, pragma='stream') + elif io_type == 'io_parallel': + if out_name in node.model.inputs: + new_var = self.array_var_converter.convert(var, pragma='reshape') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='partition') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.__class__.__name__})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/backends/fpga/fpga_types.py 
b/hls4ml/backends/fpga/fpga_types.py index c5327dab8c..408f1320e4 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -248,6 +248,13 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) +class CatapultArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + class VivadoInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' @@ -258,6 +265,11 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' +class CatapultInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -285,6 +297,11 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) +class CatapultArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) + + class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) @@ -297,6 +314,13 @@ def __init__(self, type_converter): ) +class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition + ) + + # endregion # region 
StructMemberVariable @@ -309,6 +333,13 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) +class CatapultStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -338,6 +369,13 @@ def __init__(self, type_converter): ) +class CatapultStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition + ) + + # endregion # region StreamVariable @@ -371,6 +409,21 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' +class CatapultStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration (string name arg not implemented in ac_channel) + return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class CatapultInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -402,6 +455,11 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) +class CatapultStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + 
super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) + + # endregion # region InplaceStreamVariable @@ -435,6 +493,13 @@ def __init__(self, type_converter): ) +class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition + ) + + # endregion # region WeightsVariable diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index b69dbec0f0..3bd6d06c3b 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -196,7 +196,7 @@ def convert_from_keras_model( output_data_tb (str, optional): String representing the path of output data in .npy or .dat format that will be used during csim and cosim. backend (str, optional): Name of the backend to use, e.g., 'Vivado' - or 'Quartus'. + or 'Quartus' or 'Catapult'. board (str, optional): One of target boards specified in `supported_board.json` file. If set to `None` a default device of a backend will be used. See documentation of the backend used. part (str, optional): The FPGA part. If set to `None` a default part of a backend will be used. @@ -258,7 +258,7 @@ def convert_from_pytorch_model( used during csim and cosim. Defaults to None. output_data_tb (str, optional): String representing the path of output data in .npy or .dat format that will be used during csim and cosim. Defaults to None. - backend (str, optional): Name of the backend to use, e.g., 'Vivado' or 'Quartus'. Defaults to 'Vivado'. + backend (str, optional): Name of the backend to use, e.g., 'Vivado' or 'Quartus' or 'Catapult'. Defaults to 'Vivado'. board (str, optional): One of target boards specified in `supported_board.json` file. If set to `None` a default device of a backend will be used. See documentation of the backend used. part (str, optional): The FPGA part. 
If set to `None` a default part of a backend will be used. @@ -332,7 +332,7 @@ def convert_from_onnx_model( output_data_tb (str, optional): String representing the path of output data in .npy or .dat format that will be used during csim and cosim. backend (str, optional): Name of the backend to use, e.g., 'Vivado' - or 'Quartus'. + or 'Quartus' or 'Catapult'. board (str, optional): One of target boards specified in `supported_board.json` file. If set to `None` a default device of a backend will be used. See documentation of the backend used. part (str, optional): The FPGA part. If set to `None` a default part of a backend will be used. diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index a6b5c29e89..04ec33294d 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -60,6 +60,12 @@ def get_config_value(self, key, default=None): def get_project_name(self): return self.get_config_value('ProjectName') + def get_project_dir(self): + if self.get_config_value('ProjectDir') is not None: + return self.get_config_value('ProjectDir') + else: + return self.get_config_value('ProjectName') + '_prj' + def get_output_dir(self): return self.get_config_value('OutputDir') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index b74918f642..de191baa40 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -560,6 +560,7 @@ def initialize(self): if self.model.config.is_resource_strategy(self) and self.model.config.backend.name in [ 'Vivado', 'VivadoAccelerator', + 'Catapult', ]: self.weights['weight'].data_unquantized = np.transpose(folded_weights, axes=[3, 0, 1, 2]) self.weights['weight'].data = self.get_attr('weight_quantizer')(self.weights['weight'].data_unquantized) diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 84a83de23e..9560699405 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -588,8 +588,9 @@ def get_ymodel_keras(keras_model, X): # Note that if the layer is a standalone activation 
layer then skip this name = layer.name if ( - hasattr(layer, 'activation') - and layer.activation is not None + hasattr(layer, "activation") + and hasattr(layer.activation, "__name__") + and layer.activation.__name__ != "linear" and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) and layer.activation.__name__ != 'linear' ): diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index b73558f6ee..3c9b7707b7 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -1,3 +1,6 @@ +from hls4ml.report.catapult_report import parse_catapult_report # noqa: F401 +from hls4ml.report.catapult_report import qofr # noqa: F401 +from hls4ml.report.catapult_report import read_catapult_report # noqa: F401 from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 from hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 diff --git a/hls4ml/report/catapult_report.py b/hls4ml/report/catapult_report.py new file mode 100755 index 0000000000..563a3a7594 --- /dev/null +++ b/hls4ml/report/catapult_report.py @@ -0,0 +1,256 @@ +import os +import re + +import yaml + + +def read_catapult_report(hls_dir, full_report=False): + if not os.path.exists(hls_dir): + print(f'Path {hls_dir} does not exist. Exiting.') + return + + prj_dir = None + top_func_name = None + + if os.path.isfile(hls_dir + '/build_prj.tcl'): + prj_dir, top_func_name = _parse_build_script(hls_dir + '/build_prj.tcl') + print('Prj Dir:', prj_dir) + print('Top func name:', top_func_name) + + if prj_dir is None or top_func_name is None: + print('Unable to read project data. Exiting.') + return + + sln_dir = hls_dir + '/' + prj_dir + if not os.path.exists(sln_dir): + print(f'Project {prj_dir} does not exist. 
Rerun "hls4ml build -p {hls_dir}".') + return + + solutions = _find_solutions(sln_dir, hls_dir) + + for sln in solutions: + print(f'Reports for solution "{sln}":\n') + _find_reports(sln_dir + '/' + sln, top_func_name, full_report) + + +def _parse_build_script(script_path): + prj_dir = None + top_func_name = None + + with open(script_path) as f: + for line in f.readlines(): + if 'project new' in line: + prj_dir = line.split()[-1] + if 'set design_top' in line: + top_func_name = line.split()[-1] + + return prj_dir, top_func_name + + +def _find_solutions(sln_dir, hls_dir): + solutions = [] + prj_dir, top_func_name = _parse_build_script(hls_dir + '/build_prj.tcl') + for path in os.listdir(sln_dir): + # check if current path is a dir + if os.path.isdir(os.path.join(sln_dir, path)): + pathstring = str(path) + if top_func_name in pathstring: + solutions.append(pathstring) + return solutions + + +def _find_reports(sln_dir, top_func_name, full_report=False): + csim_file = sln_dir + '/../../tb_data/csim_results.log' + if os.path.isfile(csim_file): + _show_csim_report(csim_file) + else: + print('C simulation report not found.') + + syn_file = sln_dir + '/rtl.rpt' + if os.path.isfile(syn_file): + _show_synth_report(syn_file, full_report) + else: + print('Synthesis report not found.') + + cosim_file = sln_dir + f'/sim/report/{top_func_name}_cosim.rpt' + if os.path.isfile(cosim_file): + _show_cosim_report(cosim_file) + else: + print('Co-simulation report not found.') + + timing_report = sln_dir + '/vivado_concat_v/timing_summary_synth.rpt' + if os.path.isfile(timing_report): + _show_timing_report(timing_report) + else: + print('Timing synthesis report not found.') + + utilization_report = sln_dir + '/vivado_concat_v/utilization_synth.rpt' + if os.path.isfile(utilization_report): + _show_utilization_report(utilization_report) + else: + print('Utilization synthesis report not found.') + + +def _show_csim_report(csim_file): + with open(csim_file) as f: + print('C SIMULATION 
RESULT:') + print(f.read()) + + +def _show_synth_report(synth_file, full_report=False): + with open(synth_file) as f: + print('SYNTHESIS REPORT:') + for line in f.readlines()[2:]: + if not full_report and '* DSP48' in line: + break + print(line, end='') + + +def _show_cosim_report(cosim_file): + with open(cosim_file) as f: + print('CO-SIMULATION RESULT:') + print(f.read()) + + +def _show_timing_report(timing_report): + with open(timing_report) as f: + print('TIMING REPORT:') + print(f.read()) + + +def _show_utilization_report(utilization_report): + with open(utilization_report) as f: + print('UTILIZATION REPORT:') + print(f.read()) + + +def _get_abs_and_percentage_values(unparsed_cell): + return int(unparsed_cell.split('(')[0]), float(unparsed_cell.split('(')[1].replace('%', '').replace(')', '')) + + +def parse_catapult_report(output_dir): + if not os.path.exists(output_dir): + print(f'Project OutputDir {output_dir} does not exist. Exiting.') + return + + # Read the YAML config file to determine the project settings + with open(output_dir + '/hls4ml_config.yml') as yfile: + ydata = yaml.safe_load(yfile) + + if not ydata['ProjectDir'] is None: + ProjectDir = ydata['ProjectDir'] + else: + ProjectDir = ydata['ProjectName'] + '_prj' + ProjectName = ydata['ProjectName'] + + sln_dir = output_dir + '/' + ProjectDir + if not os.path.exists(sln_dir): + print(f'Project {ProjectDir} does not exist. Rerun "hls4ml build -p {output_dir}".') + return + + solutions = _find_solutions(sln_dir, output_dir) + if len(solutions) > 1: + print(f'WARNING: Found {len(solutions)} solution(s) in {sln_dir}. 
Using the first solution.') + + report = {} + + sim_file = output_dir + '/tb_data/csim_results.log' + if os.path.isfile(sim_file): + csim_results = [] + with open(sim_file) as f: + for line in f.readlines(): + csim_results.append([r for r in line.split()]) + report['CSimResults'] = csim_results + + util_report_file = output_dir + '/' + ProjectDir + '/' + solutions[0] + '/vivado_concat_v/utilization_synth.rpt' + if os.path.isfile(util_report_file): + util_report = {} + a = 0 + with open(util_report_file) as f: + for line in f.readlines(): + # Sometimes, phrases such as 'CLB Registers' can show up in the non-tabular sections of the report + if '|' in line: + if ('CLB LUTs' in line) and (a == 0): + a += 1 + util_report['LUT'] = line.split('|')[2].strip() + elif ('CLB Registers' in line) and (a == 1): + a += 1 + util_report['FF'] = line.split('|')[2].strip() + elif ('RAMB18 ' in line) and (a == 2): + a += 1 + util_report['BRAM_18K'] = line.split('|')[2].strip() + elif ('DSPs' in line) and (a == 3): + a += 1 + util_report['DSP48E'] = line.split('|')[2].strip() + elif ('URAM' in line) and (a == 4): + a += 1 + util_report['URAM'] = line.split('|')[2].strip() + report['UtilizationReport'] = util_report + else: + print('Utilization report not found.') + + timing_report_file = output_dir + '/' + ProjectDir + '/' + solutions[0] + '/vivado_concat_v/timing_summary_synth.rpt' + if os.path.isfile(timing_report_file): + timing_report = {} + with open(timing_report_file) as f: + while not re.search('WNS', next(f)): + pass + # skip the successive line + next(f) + result = next(f).split() + + timing_report['WNS'] = float(result[0]) + timing_report['TNS'] = float(result[1]) + timing_report['WHS'] = float(result[4]) + timing_report['THS'] = float(result[5]) + timing_report['WPWS'] = float(result[8]) + timing_report['TPWS'] = float(result[9]) + + report['TimingReport'] = timing_report + else: + print('Timing report not found.') + + latest_prj_dir = 
get_latest_project_prj_directory(output_dir, ProjectDir) + latest_ver_dir = get_latest_project_version_directory(latest_prj_dir, ProjectName) + file_path = os.path.join(latest_ver_dir, 'nnet_layer_results.txt') + print('Results in nnet_layer_results.txt from:', file_path) + + # Initialize the array + report['PerLayerQOFR'] = [] + # Open the file and read its contents + with open(file_path) as file: + # Read each line and append it to the list + for line in file: + report['PerLayerQOFR'].append(line.strip()) # strip() removes leading/trailing + + return report + + +def get_latest_project_version_directory(base_path, ProjectName): + versions = [d for d in os.listdir(base_path) if d.startswith(ProjectName + '.v')] + if not versions: + raise FileNotFoundError('Error: No versions found.') + latest_version = max(versions) + return os.path.join(base_path, latest_version) + + +def get_latest_project_prj_directory(base_path, ProjectDir): + versions = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)) and d.startswith(ProjectDir)] + if not versions: + raise FileNotFoundError('Error: No versions found.') + latest_version = max(versions) + return os.path.join(base_path, latest_version) + + +def qofr(report): + # Access the PerLayerQOFR list from the report dictionary + PerLayerQOFR = report.get('PerLayerQOFR', []) + + # Check if the list is not empty + if PerLayerQOFR: + # print('Results in nnet_layer_results.txt:') + # Iterate over each line in the list and print it + for line in PerLayerQOFR: + print(line) + else: + print('No results found in nnet_layer_results.txt') diff --git a/hls4ml/templates/catapult/ac_math b/hls4ml/templates/catapult/ac_math new file mode 160000 index 0000000000..3696be957d --- /dev/null +++ b/hls4ml/templates/catapult/ac_math @@ -0,0 +1 @@ +Subproject commit 3696be957d0b0fa0a285f90382d75c8a521d77ee diff --git a/hls4ml/templates/catapult/ac_simutils b/hls4ml/templates/catapult/ac_simutils new file mode 160000 index 
0000000000..9dfe23415c --- /dev/null +++ b/hls4ml/templates/catapult/ac_simutils @@ -0,0 +1 @@ +Subproject commit 9dfe23415cf670ed7c990d9a6a237d06a5a62e57 diff --git a/hls4ml/templates/catapult/ac_types b/hls4ml/templates/catapult/ac_types new file mode 160000 index 0000000000..134dcb1a05 --- /dev/null +++ b/hls4ml/templates/catapult/ac_types @@ -0,0 +1 @@ +Subproject commit 134dcb1a05e16f242de593b9c9a33f6aa08c66e6 diff --git a/hls4ml/templates/catapult/build_lib.sh b/hls4ml/templates/catapult/build_lib.sh new file mode 100755 index 0000000000..2c7a11c626 --- /dev/null +++ b/hls4ml/templates/catapult/build_lib.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "linux"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique -Wno-pragmas" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +LDFLAGS= + +# Pick up AC libraries from Catapult install first +INCFLAGS="-I$MGC_HOME/shared/include -I$MGC_HOME/shared/include/nnet_utils -Ifirmware/ac_types/include -Ifirmware/ac_math/include -Ifirmware/ac_simutils/include -Ifirmware/nnet_utils" +PROJECT=myproject +LIB_STAMP=mystamp + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/catapult/build_prj.tcl b/hls4ml/templates/catapult/build_prj.tcl new file mode 100755 index 0000000000..7ee4d640dd --- /dev/null +++ b/hls4ml/templates/catapult/build_prj.tcl @@ -0,0 +1,356 @@ +################# +# HLS4ML +################# +array set opt { + reset 0 + csim 0 + synth 1 + cosim 0 + validation 0 + vhdl 1 + verilog 1 + export 0 + vsynth 0 + bitfile 0 + fifo_opt 0 + ran_frame 2 + sw_opt 0 + power 0 + da 0 + bup 0 +} + +# Get pathname to this script to use as 
dereference path for relative file pathnames +set sfd [file dirname [info script]] + +if { [info exists ::argv] } { + foreach arg $::argv { + foreach {optname optval} [split $arg '='] {} + if { [info exists opt($optname)] } { + if {[string is integer -strict $optval]} { + set opt($optname) $optval + } else { + set opt($optname) [string is true -strict $optval] + } + } + } +} + +puts "***** INVOKE OPTIONS *****" +foreach x [lsort [array names opt]] { + puts "[format { %-20s %s} $x $opt($x)]" +} +puts "" + +proc report_time { op_name time_start time_end } { + set time_taken [expr $time_end - $time_start] + set time_s [expr ($time_taken / 1000) % 60] + set time_m [expr ($time_taken / (1000*60)) % 60] + set time_h [expr ($time_taken / (1000*60*60)) % 24] + puts "***** ${op_name} COMPLETED IN ${time_h}h${time_m}m${time_s}s *****" +} + +proc setup_xilinx_part { part } { + # Map Xilinx PART into Catapult library names + set part_sav $part + set libname [lindex [library get /CONFIG/PARAMETERS/Vivado/PARAMETERS/Xilinx/PARAMETERS/*/PARAMETERS/*/PARAMETERS/$part/LIBRARIES/*/NAME -match glob -ret v] 0] + puts "Library Name: $libname" + if { [llength $libname] == 1 } { + set libpath [library get /CONFIG/PARAMETERS/Vivado/PARAMETERS/Xilinx/PARAMETERS/*/PARAMETERS/*/PARAMETERS/$part/LIBRARIES/*/NAME -match glob -ret p] + puts "Library Path: $libpath" + if { [regexp {/CONFIG/PARAMETERS/(\S+)/PARAMETERS/(\S+)/PARAMETERS/(\S+)/PARAMETERS/(\S+)/PARAMETERS/(\S+)/.*} $libpath dummy rtltool vendor family speed part] } { + solution library add $libname -- -rtlsyntool $rtltool -vendor $vendor -family $family -speed $speed -part $part_sav + } else { + solution library add $libname -- -rtlsyntool Vivado + } + } else { + logfile message "Could not find specific Xilinx base library for part '$part'. 
Using KINTEX-u\n" warning + solution library add mgc_Xilinx-KINTEX-u-2_beh -- -rtlsyntool Vivado -manufacturer Xilinx -family KINTEX-u -speed -2 -part xcku115-flvb2104-2-i + } + solution library add Xilinx_RAMS + solution library add Xilinx_ROMS + solution library add Xilinx_FIFO +} + + +proc setup_asic_libs { args } { + set do_saed 0 + foreach lib $args { + solution library add $lib -- -rtlsyntool DesignCompiler + if { [lsearch -exact {saed32hvt_tt0p78v125c_beh saed32lvt_tt0p78v125c_beh saed32rvt_tt0p78v125c_beh} $lib] != -1 } { + set do_saed 1 + } + } + solution library add ccs_sample_mem + solution library add ccs_sample_rom + solution library add hls4ml_lib + go libraries + + # special exception for SAED32 for use in power estimation + if { $do_saed } { + # SAED32 selected - enable DC settings to access Liberty data for power estimation + source [application get /SYSTEM/ENV_MGC_HOME]/pkgs/siflibs/saed/setup_saedlib.tcl + } +} + +options set Input/CppStandard {c++17} +options set Input/CompilerFlags -DRANDOM_FRAMES=$opt(ran_frame) +options set Input/SearchPath {$MGC_HOME/shared/include/nnet_utils} -append +options set ComponentLibs/SearchPath {$MGC_HOME/shared/pkgs/ccs_hls4ml} -append + +if {$opt(reset)} { + project load CATAPULT_DIR.ccs + go new +} else { + project new -name CATAPULT_DIR +} + +#-------------------------------------------------------- +# Configure Catapult Options +# downgrade HIER-10 +options set Message/ErrorOverride HIER-10 -remove +solution options set Message/ErrorOverride HIER-10 -remove + +if {$opt(vhdl)} { + options set Output/OutputVHDL true +} else { + options set Output/OutputVHDL false +} +if {$opt(verilog)} { + options set Output/OutputVerilog true +} else { + options set Output/OutputVerilog false +} + +#-------------------------------------------------------- +# Configure Catapult Flows +if { [info exists ::env(XILINX_PCL_CACHE)] } { +options set /Flows/Vivado/PCL_CACHE $::env(XILINX_PCL_CACHE) +solution options set 
/Flows/Vivado/PCL_CACHE $::env(XILINX_PCL_CACHE) +} + +# Turn on HLS4ML flow (wrapped in a cache so that older Catapult installs still work) +catch {flow package require /HLS4ML} + +# Turn on SCVerify flow +flow package require /SCVerify +# flow package option set /SCVerify/INVOKE_ARGS {$sfd/firmware/weights $sfd/tb_data/tb_input_features.dat $sfd/tb_data/tb_output_predictions.dat} +#hls-fpga-machine-learning insert invoke_args + +# Turn on VSCode flow +# flow package require /VSCode +# To launch VSCode on the C++ HLS design: +# cd my-Catapult-test +# code Catapult.code-workspace + +#-------------------------------------------------------- +# Start of HLS script +set design_top myproject +solution file add $sfd/firmware/myproject.cpp +solution file add $sfd/myproject_test.cpp -exclude true + +# Parse parameters.h to determine config info to control directives/pragmas +set IOType io_stream +if { ![file exists $sfd/firmware/parameters.h] } { + logfile message "Could not locate firmware/parameters.h. Unable to determine network configuration.\n" warning +} else { + set pf [open "$sfd/firmware/parameters.h" "r"] + while {![eof $pf]} { + gets $pf line + if { [string match {*io_type = nnet::io_stream*} $line] } { + set IOType io_stream + break + } + } + close $pf +} + +if { $IOType == "io_stream" } { +solution options set Architectural/DefaultRegisterThreshold 2050 +} +directive set -RESET_CLEARS_ALL_REGS no +# Constrain arrays to map to memory only over a certain size +directive set -MEM_MAP_THRESHOLD [expr 2048 * 16 + 1] +# The following line gets modified by the backend writer +set hls_clock_period 5 + +go analyze + +# NORMAL TOP DOWN FLOW +if { ! 
$opt(bup) } { + +go compile + +if {$opt(csim)} { + puts "***** C SIMULATION *****" + set time_start [clock clicks -milliseconds] + flow run /SCVerify/launch_make ./scverify/Verify_orig_cxx_osci.mk {} SIMTOOL=osci sim + set time_end [clock clicks -milliseconds] + report_time "C SIMULATION" $time_start $time_end +} + +puts "***** SETTING TECHNOLOGY LIBRARIES *****" +#hls-fpga-machine-learning insert techlibs + +directive set -CLOCKS [list clk [list -CLOCK_PERIOD $hls_clock_period -CLOCK_EDGE rising -CLOCK_OFFSET 0.000000 -CLOCK_UNCERTAINTY 0.0 -RESET_KIND sync -RESET_SYNC_NAME rst -RESET_SYNC_ACTIVE high -RESET_ASYNC_NAME arst_n -RESET_ASYNC_ACTIVE low -ENABLE_NAME {} -ENABLE_ACTIVE high]] + +if {$opt(synth)} { + puts "***** C/RTL SYNTHESIS *****" + set time_start [clock clicks -milliseconds] + + go assembly + + go architect + + go allocate + + go schedule + + go extract + set time_end [clock clicks -milliseconds] + report_time "C/RTL SYNTHESIS" $time_start $time_end +} + +# BOTTOM-UP FLOW +} else { + # Start at 'go analyze' + go analyze + + # Build the design bottom-up + directive set -CLOCKS [list clk [list -CLOCK_PERIOD $hls_clock_period -CLOCK_EDGE rising -CLOCK_OFFSET 0.000000 -CLOCK_UNCERTAINTY 0.0 -RESET_KIND sync -RESET_SYNC_NAME rst -RESET_SYNC_ACTIVE high -RESET_ASYNC_NAME arst_n -RESET_ASYNC_ACTIVE low -ENABLE_NAME {} -ENABLE_ACTIVE high]] + + set blocks [solution get /HIERCONFIG/USER_HBS/*/RESOLVED_NAME -match glob -rec 1 -ret v -state analyze] + set bu_mappings {} + set top [lindex $blocks 0] + foreach block [lreverse [lrange $blocks 1 end]] { + # skip blocks that are net nnet:: functions + if { [string match {nnet::*} $block] == 0 } { continue } + go analyze + solution design set $block -top + go compile + solution library remove * + puts "***** SETTING TECHNOLOGY LIBRARIES *****" +#hls-fpga-machine-learning insert techlibs + go extract + set block_soln "[solution get /TOP/name -checkpath 0].[solution get /VERSION -checkpath 0]" + lappend bu_mappings 
[solution get /CAT_DIR] /$top/$block "\[Block\] $block_soln" + } + + # Move to top design + go analyze + solution design set $top -top + go compile + + if {$opt(csim)} { + puts "***** C SIMULATION *****" + set time_start [clock clicks -milliseconds] + flow run /SCVerify/launch_make ./scverify/Verify_orig_cxx_osci.mk {} SIMTOOL=osci sim + set time_end [clock clicks -milliseconds] + report_time "C SIMULATION" $time_start $time_end + } + foreach {d i l} $bu_mappings { + logfile message "solution options set ComponentLibs/SearchPath $d -append\n" info + solution options set ComponentLibs/SearchPath $d -append + } + + # Add bottom-up blocks + puts "***** SETTING TECHNOLOGY LIBRARIES *****" + solution library remove * +#hls-fpga-machine-learning insert techlibs + # need to revert back to go compile + go compile + foreach {d i l} $bu_mappings { + logfile message "solution library add [list $l]\n" info + eval solution library add [list $l] + } + go libraries + + # Map to bottom-up blocks + foreach {d i l} $bu_mappings { + # Make sure block exists + set cnt [directive get $i/* -match glob -checkpath 0 -ret p] + if { $cnt != {} } { + logfile message "directive set $i -MAP_TO_MODULE [list $l]\n" info + eval directive set $i -MAP_TO_MODULE [list $l] + } + } + go assembly + set design [solution get -name] + logfile message "Adjusting FIFO_DEPTH for top-level interconnect channels\n" warning + # FIFO interconnect between layers + foreach ch_fifo_m2m [directive get -match glob -checkpath 0 -ret p $design/*_out:cns/MAP_TO_MODULE] { + set ch_fifo [join [lrange [split $ch_fifo_m2m '/'] 0 end-1] /]/FIFO_DEPTH + logfile message "directive set -match glob $ch_fifo 1\n" info + directive set -match glob "$ch_fifo" 1 + } + # For bypass paths - the depth will likely need to be larger than 1 + foreach ch_fifo_m2m [directive get -match glob -checkpath 0 -ret p $design/*_cpy*:cns/MAP_TO_MODULE] { + set ch_fifo [join [lrange [split $ch_fifo_m2m '/'] 0 end-1] /]/FIFO_DEPTH + logfile message 
"Bypass FIFO '$ch_fifo' depth set to 1 - larger value may be required to prevent deadlock\n" warning + logfile message "directive set -match glob $ch_fifo 1\n" info + directive set -match glob "$ch_fifo" 1 + } + go architect + go allocate + go schedule + go dpfsm + go extract +} + +project save + +if {$opt(cosim) || $opt(validation)} { + if {$opt(verilog)} { + flow run /SCVerify/launch_make ./scverify/Verify_rtl_v_msim.mk {} SIMTOOL=msim sim + } + if {$opt(vhdl)} { + flow run /SCVerify/launch_make ./scverify/Verify_rtl_vhdl_msim.mk {} SIMTOOL=msim sim + } +} + +if {$opt(export)} { + puts "***** EXPORT IP *****" + set time_start [clock clicks -milliseconds] +# Not yet implemented. Do we need to include value of $version ? +# flow package option set /Vivado/BoardPart xilinx.com:zcu102:part0:3.1 +# flow package option set /Vivado/IP_Taxonomy {/Catapult} +# flow run /Vivado/launch_package_ip -shell ./vivado_concat_v/concat_v_package_ip.tcl + set time_end [clock clicks -milliseconds] + report_time "EXPORT IP" $time_start $time_end +} +if {$opt(sw_opt)} { + puts "***** Pre Power Optimization *****" + go switching + if {$opt(verilog)} { + flow run /PowerAnalysis/report_pre_pwropt_Verilog + } + if {$opt(vhdl)} { + flow run /PowerAnalysis/report_pre_pwropt_VHDL + } +} + +if {$opt(power)} { + puts "***** Power Optimization *****" + go power +} + +if {$opt(vsynth)} { + puts "***** VIVADO SYNTHESIS *****" + set time_start [clock clicks -milliseconds] + flow run /Vivado/synthesize -shell vivado_concat_v/concat_rtl.v.xv + set time_end [clock clicks -milliseconds] + report_time "VIVADO SYNTHESIS" $time_start $time_end +} + +if {$opt(bitfile)} { + puts "***** Option bitfile not supported yet *****" +} + +if {$opt(da)} { + puts "***** Launching DA *****" + flow run /DesignAnalyzer/launch +} + +if { [catch {flow package present /HLS4ML}] == 0 } { + flow run /HLS4ML/collect_reports +} diff --git a/hls4ml/templates/catapult/catapult_synth.tcl 
b/hls4ml/templates/catapult/catapult_synth.tcl new file mode 100644 index 0000000000..6d80a33ef5 --- /dev/null +++ b/hls4ml/templates/catapult/catapult_synth.tcl @@ -0,0 +1,3 @@ +add_files myproject_prj/solution1/syn/vhdl +synth_design -top myproject -part xcku115-flvb2104-2-i +report_utilization -file vivado_synth.rpt diff --git a/hls4ml/templates/catapult/firmware/defines.h b/hls4ml/templates/catapult/firmware/defines.h new file mode 100755 index 0000000000..c5601779e4 --- /dev/null +++ b/hls4ml/templates/catapult/firmware/defines.h @@ -0,0 +1,15 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include "nnet_utils/nnet_types.h" +#include +#include +#include +#include +#include + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert layer-precision + +#endif diff --git a/hls4ml/templates/catapult/firmware/myproject.cpp b/hls4ml/templates/catapult/firmware/myproject.cpp new file mode 100755 index 0000000000..bdb0570f8b --- /dev/null +++ b/hls4ml/templates/catapult/firmware/myproject.cpp @@ -0,0 +1,29 @@ +#include + +#include "myproject.h" +#include "parameters.h" + +#include + +#pragma hls_design top +// hls-fpga-machine-learning insert IFSynPragmas +void CCS_BLOCK(myproject)( + // hls-fpga-machine-learning insert header +) { + + // hls-fpga-machine-learning insert IO + +#ifndef __SYNTHESIS__ + static bool loaded_weights = false; + if (!loaded_weights) { + // hls-fpga-machine-learning insert load weights + loaded_weights = true; + } +#endif + + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning insert layers +} diff --git a/hls4ml/templates/catapult/firmware/myproject.h b/hls4ml/templates/catapult/firmware/myproject.h new file mode 100755 index 0000000000..dd73c3e807 --- /dev/null +++ b/hls4ml/templates/catapult/firmware/myproject.h @@ -0,0 +1,15 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include +#include +#include + +#include 
"defines.h" + +// Prototype of top level function for C-synthesis +void myproject( + // hls-fpga-machine-learning insert header +); + +#endif diff --git a/hls4ml/templates/catapult/firmware/parameters.h b/hls4ml/templates/catapult/firmware/parameters.h new file mode 100755 index 0000000000..2915c145c8 --- /dev/null +++ b/hls4ml/templates/catapult/firmware/parameters.h @@ -0,0 +1,15 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include +#include + +#include "nnet_utils/nnet_code_gen.h" +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// hls-fpga-machine-learning insert weights + +// hls-fpga-machine-learning insert layer-config + +#endif diff --git a/hls4ml/templates/catapult/myproject_bridge.cpp b/hls4ml/templates/catapult/myproject_bridge.cpp new file mode 100755 index 0000000000..f1326a1faf --- /dev/null +++ b/hls4ml/templates/catapult/myproject_bridge.cpp @@ -0,0 +1,72 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" +#include "nnet_helpers.h" +#include +#include + +static std::string s_weights_dir = "weights"; + +const char *get_weights_dir() { return s_weights_dir.c_str(); } + +// hls-fpga-machine-learning insert bram + +// hls-fpga-machine-learning insert declare weights + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + 
nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif diff --git a/hls4ml/templates/catapult/myproject_test.cpp b/hls4ml/templates/catapult/myproject_test.cpp new file mode 100755 index 0000000000..66b87f6741 --- /dev/null +++ b/hls4ml/templates/catapult/myproject_test.cpp @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static std::string s_weights_dir; + +const char *get_weights_dir() { return s_weights_dir.c_str(); } + +#include "firmware/myproject.h" +#include "nnet_utils/nnet_helpers.h" +// #include "firmware/parameters.h" + +#include + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +#ifndef RANDOM_FRAMES +#define RANDOM_FRAMES 1 +#endif + +// hls-fpga-machine-learning insert declare weights + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +CCS_MAIN(int argc, char *argv[]) { + if (argc < 2) { + std::cerr << "Error - too few arguments" << std::endl; + std::cerr << "Usage: " << argv[0] << " " << std::endl; + std::cerr << "Where: - string pathname to directory containing wN.txt and bN.txt files" + << std::endl; + std::cerr << " - string pathname to tb_input_features.dat (optional)" << std::endl; + std::cerr << " - string pathname to tb_output_predictions.dat (optional)" << std::endl; + 
std::cerr << std::endl; + std::cerr << "If no testbench input/prediction data provided, random input data will be generated" << std::endl; + CCS_RETURN(-1); + } + s_weights_dir = argv[1]; + std::cout << " Weights directory: " << s_weights_dir << std::endl; + + std::string tb_in; + std::string tb_out; + std::ifstream fin; + std::ifstream fpr; + bool use_random = false; + if (argc == 2) { + std::cout << "No testbench files provided - Using random input data" << std::endl; + use_random = true; + } else { + tb_in = argv[2]; + tb_out = argv[3]; + std::cout << " Test Feature Data: " << tb_in << std::endl; + std::cout << " Test Predictions : " << tb_out << std::endl; + + // load input data from text file + fin.open(tb_in); + // load predictions from text file + fpr.open(tb_out); + if (!fin.is_open() || !fpr.is_open()) { + use_random = true; + } + } + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + +#ifndef __SYNTHESIS__ + static bool loaded_weights = false; + if (!loaded_weights) { + // hls-fpga-machine-learning insert load weights + loaded_weights = true; + } +#endif + std::string iline; + std::string pline; + int e = 0; + + if (!use_random) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + // std::cout << " Input feature map size = " << in.size() << " Output predictions size = " << pr.size() << + // std::endl; + + // hls-fpga-machine-learning insert data + + // 
hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + if (fin.is_open()) { + fin.close(); + } + if (fpr.is_open()) { + fpr.close(); + } + } else { + std::cout << "INFO: Unable to open input/predictions file(s) so feeding random values" << std::endl; + std::cout << "Number of Frames Passed from the tcl= " << RANDOM_FRAMES << std::endl; + + if (RANDOM_FRAMES > 0) { + for (unsigned int k = 0; k < RANDOM_FRAMES; k++) { + // hls-fpga-machine-learning insert random + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } else { + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h b/hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h new file mode 100644 index 0000000000..0645efa73f --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/ap_shift_reg.h @@ -0,0 +1,136 @@ +/* +#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. 
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. 
+#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __SIM_AP_SHIFT_REG_H__ +#define __SIM_AP_SHIFT_REG_H__ + +/* + * This file contains a C++ model of shift register. + * It defines C level simulation model. + */ +#ifndef __cplusplus +#error C++ is required to include this header file +#else + +#ifndef __SYNTHESIS__ +#include +#endif + +////////////////////////////////////////////// +// C level simulation model for ap_shift_reg +////////////////////////////////////////////// +template class ap_shift_reg { + public: + /// Constructors + ap_shift_reg() { + for (unsigned int i = 0; i < __SHIFT_DEPTH__; i++) { + __SHIFT_T__ dummy; + Array[i] = dummy; // uninitialize so Catapult does not add a reset + } + } + ap_shift_reg(const char *name) {} + /// Destructor + virtual ~ap_shift_reg() {} + + private: + /// Make copy constructor and assignment operator private + ap_shift_reg(const ap_shift_reg<__SHIFT_T__, __SHIFT_DEPTH__> &shreg) { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + } + + ap_shift_reg &operator=(const ap_shift_reg<__SHIFT_T__, __SHIFT_DEPTH__> &shreg) { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + return *this; + } + + public: + // Shift the queue, push to back and read from a given address. 
+ __SHIFT_T__ shift(__SHIFT_T__ DataIn, unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true) { +#ifndef __SYNTHESIS__ + assert(Addr < __SHIFT_DEPTH__ && "Out-of-bound shift is found in ap_shift_reg."); +#endif + __SHIFT_T__ ret = Array[Addr]; + if (Enable) { + for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i) + Array[i] = Array[i - 1]; + Array[0] = DataIn; + } + return ret; + } + + // Read from a given address. + __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const { +#ifndef __SYNTHESIS__ + assert(Addr < __SHIFT_DEPTH__ && "Out-of-bound read is found in ap_shift_reg."); +#endif + return Array[Addr]; + } + + protected: + __SHIFT_T__ Array[__SHIFT_DEPTH__]; +}; + +#endif //__cplusplus + +#endif //__SIM_AP_SHIFT_REG_H__ diff --git a/hls4ml/templates/catapult/nnet_utils/hls_math.h b/hls4ml/templates/catapult/nnet_utils/hls_math.h new file mode 100755 index 0000000000..ea05fe122a --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/hls_math.h @@ -0,0 +1,24 @@ +#ifndef X_HLS_MATH_H +#define X_HLS_MATH_H + +#include "ac_fixed.h" +#include + +namespace hls { + +template static T exp(const T x) { return (T)std::exp(x.to_double()); } + +template T sin(T x) { return (T)std::sin(x.to_double()); }; + +template T cos(T x) { return (T)std::cos(x.to_double()); }; + +template T asin(T x) { return (T)std::asin(x.to_double()); }; + +template T acos(T x) { return (T)std::acos(x.to_double()); }; + +template T atan(T x) { return (T)std::atan(x.to_double()); }; + +template T atan2(T x, T y) { return (T)hls::atan2(x.to_double(), y.to_double()); }; + +} // namespace hls +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h new file mode 100644 index 0000000000..f08e75a0d6 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation.h @@ -0,0 +1,1107 @@ + +// Change History: +// 2022-06-30 dgburnette - Cleaned up code to separate AC Math from LUT code. 
+// Added LUT dump to text file. +// Activation functions not implemented in AC Math will assert. +// 2022-06-28 dgburnette - Replaced AP Types with AC Datatypes. +// Commented out all Vivado pragmas. +// Added Catapult hierarchy pragmas. +// Started replacement of activation functions with +// AC Math piecewise-linear versions. + +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +// Define this macro to switch the implementations of certain activiation functions +// from the original HLS4ML look-up table approach to using the piecewise-linear approximation +// functions in AC Math. +#define USE_AC_MATH 1 + +#if !defined(USE_AC_MATH) && !defined(__SYNTHESIS__) +// Define a macro that causes the look-up table generation code to dump out text files +// of the array contents. +// #define BUILD_TABLE_FILE 1 +#endif + +#include "ac_fixed.h" +#include "ac_std_float.h" +#include "nnet_common.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ac_fixed<18, 8, true> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + 
datareg = data[ii]; +#ifndef USE_AC_MATH + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; +#else + ac_math::ac_relu(datareg, res[ii]); +#endif + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* + +template +void ac_sigmoid_pwl_wrapper(const ac_fixed(&input) /*[K]*/, + ac_fixed(&output) /*[K]*/) { + ac_fixed tmp; //[K]; + ac_math::ac_sigmoid_pwl(input, tmp); + output = tmp; +} + +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } + +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "sigmoid_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_sigmoid_table()\n"); +#endif + // Default logistic sigmoid function: + // result = 1/(1+e^(-x)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", sigmoid_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " 
// sigmoid(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 16; + index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; + } +} + +#else + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + // res[ii] = ac_math::ac_sigmoid_pwl(data[ii]); + ac_sigmoid_pwl_wrapper(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2 }; + +inline float exp_fcn_float(float input) { return std::exp(input); } + +template inline float softmax_real_val_from_idx(unsigned i) { + // Treat the index as the top N bits + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + data_T x(0); + // CATAPULT_PORT + // x(x.width-1, x.width-N) = i; + ac_int tmp = i; + x.template set_slc(x.width - N, tmp); + return (float)x.to_double(); +} + +template inline unsigned softmax_idx_from_real_val(data_T x) { + // Slice the top N 
bits to get an index into the table + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + // CATAPULT_PORT + // ac_int y = x(x.width-1, x.width-N); // slice the top N bits of input + // return (unsigned) y(N-1, 0); + ac_int y = x.template slc(x.width - N); // slice the top N bits of input + return (unsigned)y.template slc(0); +} + +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "exp_table%d.tab", CONFIG_T::table_size); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_exp_table()\n"); +#endif + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", exp_fcn_float(x)); + if (i < CONFIG_T::table_size - 1) + fprintf(f, ","); + fprintf(f, " // exp(%32.31f)", x); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "invert_table%d.tab", CONFIG_T::table_size); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_invert_table()\n"); +#endif + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); +#ifdef __SYNTHESIS__ + // hack for now to get through the flow + typename CONFIG_T::inv_table_t inv_x = 1 + x; +#else + typename CONFIG_T::inv_table_t inv_x = 1 / x; +#endif + table_out[i] = inv_x; +#ifdef BUILD_TABLE_FILE + if (x > 0.0) + fprintf(f, "%32.31f", (1.0 / x)); + else + fprintf(f, 
"%32.31f", 0.0); + if (i < CONFIG_T::table_size - 1) + fprintf(f, ","); + fprintf(f, " // 1/(%32.31f)", x); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ac_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +#endif + +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "exp_table_legacy%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_exp_table_legacy()\n"); +#endif + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", exp_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // exp(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "invert_table_legacy%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_invert_table_legacy()\n"); +#endif + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +64) + float in_val = 64.0 * ii / float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; +#ifdef BUILD_TABLE_FILE + if (in_val > 0.0) + fprintf(f, 
"%32.31f", (1.0 / in_val)); + else + fprintf(f, "%32.31f", 0.0); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // 1/%32.31f", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision + data_T data_cache[CONFIG_T::n_in]; + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_cache[ii] = data[ii]; + exp_res[ii] = 0; + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) + exp_diff_res = 1; + else { + // CATAPULT_PORT + // data_round = (data_cache[jj]-data_cache[ii])*CONFIG_T::table_size/16; + auto tmp_data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + data_round = tmp_data_round.to_int(); + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + exp_res[ii] += exp_diff_res; + } + } + + // Second loop to invert + for (int ii = 0; ii < CONFIG_T::n_in; ii++) 
{ + // CATAPULT_PORT + // int exp_res_index = exp_res[ii]*CONFIG_T::table_size/64; + auto tmp_exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + int exp_res_index = tmp_exp_res_index.to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; + } +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + } +} + +#else +// This is a workaround to help the template deduction to work correctly and fix the inconsistency that HLS4ML expects +// softmax output to be signed but AC Math softmax knows it is always unsigned +template +void ac_softmax_pwl_wrapper(const ac_fixed (&input)[K], ac_fixed (&output)[K]) { + ac_fixed tmp[K]; + ac_math::ac_softmax_pwl(input, tmp); + for (unsigned int x = 0; x < K; x++) + output[x] = tmp[x]; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + data_T data_copy[CONFIG_T::n_in]; + res_T res_copy[CONFIG_T::n_in]; +// workaround for the array passing - alternative is to change the signature of all of the functions to reference-of-array +COPY_IN_ARRAY: + for (unsigned i = 0; i < CONFIG_T::n_in; i++) + data_copy[i] = data[i]; + ac_softmax_pwl_wrapper(data_copy, res_copy); +COPY_OUT_ARRAY: + for (unsigned i = 0; i < CONFIG_T::n_in; i++) + res[i] = res_copy[i]; +} + +#endif + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t 
table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "tanh_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_tanh_table()\n"); +#endif + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", tanh(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // tanh(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 8; + index = data_round + 4 * (int)CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +#else + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; 
ii < CONFIG_T::n_in; ii++) { + // res[ii] = ac_math::ac_tanh_pwl(data[ii]); + ac_math::ac_tanh_pwl(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + data_T slope = (data_T)0.2; + data_T shift = (data_T)0.5; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = slope * data[ii] + shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +// ************************************************* +// Hard TanH Activation +// ************************************************* +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + data_T slope = (data_T)0.2; + data_T shift = (data_T)0.5; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + datareg = 1; + else if (sigmoid < 0) + datareg = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if 
(datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "softplus_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_softplus_table()\n"); +#endif + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", softplus_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // softplus(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 16; + 
index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +#else +template +void ac_softplus_pwl_wrapper(const ac_fixed(&input), ac_fixed(&output)) { + ac_fixed tmp; + ac_math::ac_softplus_pwl(input, tmp); + output = tmp; +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_softplus_pwl_wrapper(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "softsign_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_softsign_table()\n"); +#endif + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", softsign_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // softsign(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = 
false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii].to_double() * (int)CONFIG_T::table_size / 16; + index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +#else + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + // res[ii] = ac_math::ac_softsign_pwl(data[ii]); + ac_math::ac_softsign_pwl(data[ii], res[ii]); + } +} + +#endif + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "elu_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_elu_table()\n"); +#endif + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", elu_fcn_float(in_val)); + if (ii < N_TABLE - 1) + 
fprintf(f, ","); + fprintf(f, " // elu(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +#else + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_math::ac_elu_pwl(data[ii], res[ii], alpha); + } +} + +#endif + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { +#ifdef BUILD_TABLE_FILE + char filename[1024]; + sprintf(filename, "selu_table%d.tab", N_TABLE); + FILE *f = fopen(filename, "w"); + fprintf(f, "// init_selu_table()\n"); +#endif + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float 
in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; +#ifdef BUILD_TABLE_FILE + fprintf(f, "%32.31f", selu_fcn_float(in_val)); + if (ii < N_TABLE - 1) + fprintf(f, ","); + fprintf(f, " // selu(%32.31f)", in_val); + fprintf(f, "\n"); +#endif + } +#ifdef BUILD_TABLE_FILE + fclose(f); +#endif +} + +#ifndef USE_AC_MATH + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +#else + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = ac_math::ac_selu_pwl(data[ii]); + } +} + +#endif + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = 
datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + //#pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h new file mode 100644 index 0000000000..509560bc2b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,922 @@ + +// Change History: +// 2022-06-30 dgburnette - Cleaned up code to separate AC Math from LUT code. +// Activation functions not implemented in AC Math will assert. +// 2022-06-28 dgburnette - Replaced AP Types with AC Datatypes. 
+ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "ac_channel.h" +#include "ac_fixed.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_stream.h" +#include "nnet_types.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +// ************************************************* +// LINEAR Activation +// ************************************************* +// Adding this to work around problem with Catapult and SR model where the output channel appears to be inout +template void linear(ac_channel &data, ac_channel &res) { +LinearActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + LinearPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(ac_channel &data, ac_channel &res) { +ReLUActLoop: + for (unsigned int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ReLUPackLoop: + for (unsigned int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL +#ifndef USE_AC_MATH + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; +#else + ac_math::ac_relu(in_data[j], out_data[j]); +#endif + } + + res.write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +#ifndef USE_AC_MATH + +template void sigmoid(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename 
CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 16; + int index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = sigmoid_table[index]; + } + + res.write(out_data); + } +} + +#else + +template void sigmoid(ac_channel &data, ac_channel &res) { +SigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + // ac_math::ac_sigmoid_pwl(in_data[j], out_data[j]); + ac_sigmoid_pwl_wrapper(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// Softmax Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template +void softmax_latency(ac_channel &data, ac_channel &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, 
which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + (void)ii; + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + +SoftmaxExpLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxExpPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(in_pack[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + //#pragma HLS DATA_PACK variable=out_pack + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_stable(ac_channel &data, ac_channel &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // 
Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + (void)ii; + + typename data_T::value_type data_array[data_T::size]; + //#pragma HLS ARRAY_PARTITION variable=data_array complete + + if constexpr (ii == 1) { + } + if constexpr (ii != 1) { + // future enhancement for Catapult + } +SoftmaxArrayLoop: + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_pack = data.read(); + SoftmaxArrayPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + typename data_T::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ac_fixed d_xi_xmax[data_T::size]; + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + //#pragma HLS ARRAY_PARTITION variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + //#pragma HLS DATA_PACK variable=out_pack + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template +void softmax_legacy(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[data_T::size]; + typename CONFIG_T::table_t exp_diff_res; + typename data_T::value_type data_cache[data_T::size]; + +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + //#pragma HLS PIPELINE + data_T in_pack = data.read(); + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + for (int i = 0; i < data_T::size; i++) { + //#pragma HLS UNROLL + SoftmaxExpInner: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = + (data_cache[j].to_double() - data_cache[i].to_double()) * (int)CONFIG_T::table_size / 16; + int 
index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + + exp_res[i] += exp_diff_res; + } + } + + res_T out_pack; + //#pragma HLS DATA_PACK variable=out_pack + SoftmaxInvPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + + int exp_res_index = exp_res[j].to_double() * (int)CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = (int)CONFIG_T::table_size - 1; + + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; + } + res.write(out_pack); + } +} + +template void softmax(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::axis == -1); + + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + } +} + +#else + +template void softmax(ac_channel &data, ac_channel &res) { + typename data_T::value_type data_cache[data_T::size]; + typename res_T::value_type res_cache[res_T::size]; +SoftmaxInitLoop: + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + data_T in_pack = data.read(); + + SoftmaxInitPackLoop: + for (unsigned j = 0; j < data_T::size; j++) { + data_cache[j] = in_pack[j]; + } + + res_T out_pack; + // ac_math::ac_softmax_pwl(data_cache,res_cache); + ac_softmax_pwl_wrapper(data_cache, res_cache); + + SoftmaxResPackLoop: + for (unsigned j = 0; j < res_T::size; j++) { + out_pack[j] = res_cache[j]; + } + + res.write(out_pack); + } +} + +#endif + +// ************************************************* +// TanH Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template void tanh(ac_channel &data, ac_channel &res) { + // 
Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 8; + int index = data_round + 4 * (int)CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = tanh_table[index]; + } + + res.write(out_data); + } +} + +#else + +template void tanh(ac_channel &data, ac_channel &res) { +TanHActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + TanHPackLoop: + for (int j = 0; j < res_T::size; j++) { + // int data_round = in_data[j]*CONFIG_T::table_size/8; + ac_math::ac_tanh_pwl(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* + +template void hard_sigmoid(ac_channel &data, ac_channel &res) { + typename data_T::value_type slope = (typename data_T::value_type)0.2; + typename data_T::value_type shift = (typename data_T::value_type)0.5; + +HardSigmoidActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + HardSigmoidPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + typename 
data_T::value_type datareg = slope * in_data[j] + shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res.write(out_data); + } +} + +// ************************************************* +// Hard TanH Activation +// ************************************************* + +template void hard_tanh(ac_channel &data, ac_channel &res) { + // typename data_T::value_type slope = (typename data_T::value_type) 0.2; + // typename data_T::value_type shift = (typename data_T::value_type) 0.5; + +HardTanhActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + HardTanhPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { +LeakyReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + LeakyReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* + +template +void thresholded_relu(ac_channel &data, typename data_T::value_type theta, ac_channel &res) { +ThresholdedReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; 
i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ThresholdedReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template void softplus(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 16; + int index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + res.write(out_data); + } +} + +#else + +template void softplus(ac_channel &data, ac_channel &res) { +SoftplusActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SoftplusPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_softplus_pwl_wrapper(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// Softsign Activation +// 
************************************************* + +#ifndef USE_AC_MATH + +template void softsign(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + int data_round = in_data[j].to_double() * (int)CONFIG_T::table_size / 16; + int index = data_round + 8 * (int)CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = softsign_table[index]; + } + res.write(out_data); + } +} + +#else + +template void softsign(ac_channel &data, ac_channel &res) { +SoftsignActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SoftsignPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_math::ac_softsign_pwl(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// ELU Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template +void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + + if (!initialized) { 
+ init_elu_table(elu_table); + initialized = true; + } + +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = (int)datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + res.write(out_data); + } +} + +#else +template +void elu(ac_channel &data, typename data_T::value_type alpha, ac_channel &res) { +EluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + EluPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_math::ac_elu_pwl(in_data[j], out_data[j], alpha); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// SELU Activation +// ************************************************* + +#ifndef USE_AC_MATH + +template void selu(ac_channel &data, ac_channel &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = (typename 
data_T::value_type)1.0507009873554804934193349852946 * datareg; + } else { + int index = (int)datareg.to_double() * (int)CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = (int)CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + res.write(out_data); + } +} + +#else + +template void selu(ac_channel &data, ac_channel &res) { +SeluActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + SeluPackLoop: + for (int j = 0; j < res_T::size; j++) { + ac_math::ac_selu_pwl(in_data[j], out_data[j]); + } + res.write(out_data); + } +} + +#endif + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(ac_channel &data, typename data_T::value_type alpha[CONFIG_T::n_in], ac_channel &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(ac_channel &data, ac_channel &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; + } + res.write(out_data); + } +} + +// 
************************************************* +// Ternary TanH Activation +// ************************************************* +template void ternary_tanh(ac_channel &data, ac_channel &res) { +PReLUActLoop: + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + PReLUPackLoop: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; + } + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_array.h b/hls4ml/templates/catapult/nnet_utils/nnet_array.h new file mode 100755 index 0000000000..cd3b73cf73 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_array.h @@ -0,0 +1,52 @@ +#ifndef NNET_ARRAY_H_ +#define NNET_ARRAY_H_ + +#include + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::height; i++) { + for (int j = 0; j < CONFIG_T::width; j++) { + data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + unsigned dims_t[3]; + dims_t[0] = dims[CONFIG_T::perm[0]]; + dims_t[1] = dims[CONFIG_T::perm[1]]; + dims_t[2] = dims[CONFIG_T::perm[2]]; + + int idx[3] = 
{0}, idx_t[3] = {0}; + for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { + for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { + for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { + idx_t[0] = idx[CONFIG_T::perm[0]]; + idx_t[1] = idx[CONFIG_T::perm[1]]; + idx_t[2] = idx[CONFIG_T::perm[2]]; + + data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = + data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h new file mode 100644 index 0000000000..1db18043ec --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm.h @@ -0,0 +1,127 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const int n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
+ template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + data_T cache; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=scale,bias + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + //#pragma HLS ARRAY_PARTITION variable=scale complete + //#pragma HLS ARRAY_PARTITION variable=bias complete + + int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor)); + CONFIG_T::template product::limit(multiplier_limit); + + // Calcuate result +Result: + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = + CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const int n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void 
normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], + data_T threshold[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ac_int<1, false> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_in], data_T threshold_lo[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ac_int<2, true> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h new file mode 100644 index 0000000000..48085f82dc --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_batchnorm_stream.h @@ -0,0 +1,113 @@ + +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(ac_channel &data, ac_channel &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + //#pragma HLS ARRAY_PARTITION variable=scale 
complete + //#pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit; + (void)ii; + CONFIG_T::template product::limit(multiplier_limit); + +BatchNormLoop: + for (unsigned int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + BatchNormpack: + for (unsigned int j = 0; j < data_T::size; j++) { + // #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(ac_channel &data, ac_channel, CONFIG_T::n_in>> &res, + typename data_T::value_type threshold[CONFIG_T::n_in]) { + //#pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + //#pragma HLS DATA_PACK variable=out_data + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + out_data[j] = (in_data[j] > threshold[i * data_T::size + j]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(ac_channel &data, ac_channel, CONFIG_T::n_in>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_in], + typename data_T::value_type threshold_lo[CONFIG_T::n_in]) { + //#pragma HLS ARRAY_PARTITION variable=threshold_hi complete + //#pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + //#pragma HLS DATA_PACK variable=out_data + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + + int norm_index = i * data_T::size + j; + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h b/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h new file mode 100755 index 0000000000..e4db43682e --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_code_gen.h @@ -0,0 +1,32 @@ +#ifndef NNET_INSTR_GEN_H_ +#define NNET_INSTR_GEN_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +// hls4ml insert code + +} // namespace nnet + 
+#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_common.h b/hls4ml/templates/catapult/nnet_utils/nnet_common.h new file mode 100755 index 0000000000..b9b27209fa --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_common.h @@ -0,0 +1,66 @@ + +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "ac_fixed.h" + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n > d ? n : d) + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; +enum strategy { latency, resource }; + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Vivado cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + + if (N == 1) { + return x[0]; + } else if (N == 2) { + return op(x[0], x[1]); + } else { + return op(reduce(x, op), reduce(x + leftN, op)); + } +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_and { + public: + T operator()(T a, T b) { return a && b; } +}; + +template class Op_or { + public: + T operator()(T a, T b) { return a || b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +template class Op_min { + public: + T operator()(T a, T b) { return a <= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h new file mode 100755 index 0000000000..98e075d4ab --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d.h @@ -0,0 +1,62 @@ + +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_latency.h" +#include "nnet_conv1d_resource.h" +#include + +namespace nnet { + +struct conv1d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 10; + static const unsigned n_chan = 0; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t 
biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + if (CONFIG_T::strategy == nnet::latency) { + pointwise_conv_1d_latency_cl(data, res, weights, biases); + } else { + pointwise_conv_1d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h new file mode 100755 index 0000000000..0323b1ac4b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_latency.h @@ -0,0 +1,198 @@ +#ifndef NNET_CONV1D_LATENCY_H_ +#define NNET_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include + +namespace nnet { + +// Computes multiplier limit +// This function should not be synthesized into firmware +template +int compute_multiplier_limit( + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt]) { + int n_mult = 0; + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + for (int jj = 0; jj < CONFIG_T::filt_width; jj++) { + + int index_weight = jj * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + + if ((ii * CONFIG_T::stride_width + jj) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width + jj) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + // padded -- do nothing + continue; + } else { + // need to tune this cut? 
+ if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) { + n_mult++; + } // end if nonzero weight + } // end not padding + } // end loop accross filter + } // end channel loop + } // end filter loop + } // end output loop + + return ceil(float(n_mult) / float(CONFIG_T::reuse_factor)); + +} // end compute_n_mult + +template +void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + ConvMult: + for (int jj = 0; jj < CONFIG_T::filt_width; jj++) { + + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_width + cc * CONFIG_T::filt_width + jj; + int index_weight = jj * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + int 
index_data = (ii * CONFIG_T::stride_width + jj - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width + jj) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width + jj) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + AccumDot: + for (int jj = 0; jj < CONFIG_T::filt_width; jj++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_width + cc * CONFIG_T::filt_width + jj; + acc[ii][ff] += mult[index_mult]; + } // end dot product loop + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width][CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION 
variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter 
loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = (res_T)(acc[ii][ff]); + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h new file mode 100644 index 0000000000..143a1271ba --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_resource.h @@ -0,0 +1,241 @@ +#ifndef NNET_CONV1D_RESOURCE_H_ +#define NNET_CONV1D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void im2col_1d(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]) { + // int index = 0; + for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) { + //#pragma HLS PIPELINE II=1 rewind + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation; + for (int output_col = CONFIG_T::out_width; output_col; output_col--) { + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + *(data_col++) = data[input_col]; + // data_col[index] = data[input_col]; + } else { + *(data_col++) = 0; + // data_col[index] = 0; + } + // index++; + input_col += CONFIG_T::stride_width; + } + } + } +} + +template +void conv_1d_full(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + data_T data_conv[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_width]; + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + ////#pragma HLS ARRAY_PARTITION variable=data_conv complete 
+ //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + + im2col_1d(data, data_conv); + + for (int i = 0; i < CONFIG_T::out_width; i++) { + for (int j = 0; j < CONFIG_T::filt_width * CONFIG_T::n_chan; j++) { + data_col[j] = data_conv[j * CONFIG_T::out_width + i]; + } + dense_resource(data_col, res_col, weights, biases); + for (int j = 0; j < CONFIG_T::n_filt; j++) { + // res[i * CONFIG_T::n_filt + j] = res_col[j]; + res[j * CONFIG_T::out_width + i] = res_col[j]; // Transposed order + } + } +} + +template +void im2col_1d_cf_idx(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], const int col) { +ChannelLoop: + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + //#pragma HLS PIPELINE II=1 rewind + KernelLoop: + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + //*(data_col++) = data[input_col]; + data_col[channel * CONFIG_T::filt_width + kernel_col] = data[channel * CONFIG_T::in_width + input_col]; + } else { + //*(data_col++) = 0; + data_col[channel * CONFIG_T::filt_width + kernel_col] = 0; + } + } + } +} + +template +void im2col_1d_cf(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_width], const int col) { + int index = 0; +ChannelLoop: + for (int channel = CONFIG_T::n_chan; channel--; data += CONFIG_T::in_width) { + KernelLoop: + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + //*(data_col++) = data[input_col]; + data_col[index] = data[input_col]; + } else { + //*(data_col++) = 0; + data_col[index] 
= 0;
+            }
+            index++;
+        }
+    }
+}
+
+// conv_1d_resource_cf: channels-first 1-D convolution, built as im2col + dense per output column.
+// NOTE(review): the bare `template` lines in this chunk lost their <...> parameter lists when the
+// patch was pasted — restore from the source tree before applying.
+template
+void conv_1d_resource_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width],
+                         res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                         typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                         typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    // nin/nout/block_factor only feed the commented-out ARRAY_RESHAPE pragma below (dead code today).
+    const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width;
+    const int nout = CONFIG_T::n_filt;
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);
+
+    ////#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
+    /// correctly
+    ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    ////#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
+    res_T res_col[CONFIG_T::n_filt];
+
+    //#pragma HLS ARRAY_PARTITION variable=data_col complete
+    //#pragma HLS ARRAY_PARTITION variable=res_col complete
+
+ColLoop:
+    for (int i = 0; i < CONFIG_T::out_width; i++) {
+        //#pragma HLS PIPELINE
+        im2col_1d_cf(data, data_col, i);
+        dense_resource(data_col, res_col, weights, biases);
+        for (int j = 0; j < CONFIG_T::n_filt; j++) {
+            // res[i * CONFIG_T::n_filt + j] = res_col[j];
+            res[j * CONFIG_T::out_width + i] = res_col[j]; // Transposed order
+        }
+    }
+}
+
+// im2col_1d_cl: gather one output column's receptive field, channels-last layout.
+// The flattened-index bounds test below is equivalent to checking the input column range,
+// because 0 <= channel < n_chan (negative columns flatten to index < 0, columns >= in_width
+// flatten to index >= in_width * n_chan).
+template
+void im2col_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                  data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan], const int col) {
+    int index = 0;
+KernelLoop:
+    for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
+
+    ChannelLoop:
+        for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+            int index_data = (col * CONFIG_T::stride_width + kernel_col - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel;
+
+            if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) {
+                data_col[index] = data[index_data];
+            } else {
+                data_col[index] = 0; // zero-pad outside the input
+            }
+            index++;
+        }
+    }
+}
+
+// im2col_1d_pointwise_cl: 1x1-kernel specialization — one input column per output column.
+template
+void im2col_1d_pointwise_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], data_T data_col[CONFIG_T::n_chan],
+                            const int col) {
+    int index = 0;
+ChannelLoop:
+    for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
+
+        int index_data = (col * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + channel;
+
+        if (index_data >= 0 && index_data < CONFIG_T::in_width * CONFIG_T::n_chan) {
+            data_col[index] = data[index_data];
+        } else {
+            data_col[index] = 0;
+        }
+        index++;
+    }
+}
+
+// conv_1d_resource_cl: channels-last 1-D convolution, im2col + dense per output column.
+// Unlike the _cf variant above, the result is written in channels-last (non-transposed) order.
+template
+void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                         res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                         typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                         typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width;
+    const int nout = CONFIG_T::n_filt;
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int block_factor = DIV_ROUNDUP(nin * nout, rufactor);
+
+    ////#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose
+    /// correctly
+    ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    ////#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    data_T data_col[CONFIG_T::filt_width * CONFIG_T::n_chan];
+    res_T res_col[CONFIG_T::n_filt];
+
+    //#pragma HLS ARRAY_PARTITION variable=data_col complete
+    //#pragma HLS ARRAY_PARTITION variable=res_col complete
+
+ColLoop:
+    for (int i = 0; i < CONFIG_T::out_width; i++) {
+        //#pragma HLS PIPELINE
+        im2col_1d_cl(data, data_col, i);
+        dense_resource(data_col, res_col, weights, biases);
+        for (int j = 0; j < CONFIG_T::n_filt; j++) {
+            res[i * CONFIG_T::n_filt + j] = res_col[j];
+        }
+    }
+}
+
+template
+void pointwise_conv_1d_resource_cl(data_T
data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + const int nin = CONFIG_T::n_chan; + const int nout = CONFIG_T::n_filt; + const int rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T data_col[CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +ColLoop: + for (int i = 0; i < CONFIG_T::out_width; i++) { + //#pragma HLS PIPELINE + im2col_1d_pointwise_cl(data, data_col, i); + dense_resource(data_col, res_col, weights, biases); + for (int j = 0; j < CONFIG_T::n_filt; j++) { + res[i * CONFIG_T::n_filt + j] = res_col[j]; + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h new file mode 100644 index 0000000000..48f6244ce1 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv1d_stream.h @@ -0,0 +1,94 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ac_int *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + // #pragma HLS UNROLL + 
unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + ac_channel data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + // const int win_depth = CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); + 
(void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h new file mode 100755 index 0000000000..01476a0449 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d.h @@ -0,0 +1,84 @@ + +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + 
static const unsigned out_width = 10;
+    static const unsigned dilation_height = 1;
+    static const unsigned dilation_width = 1;
+
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0; // not used yet
+};
+
+// conv_2d_cf: strategy dispatcher (channels-first) — latency vs. resource implementation.
+// NOTE(review): bare `template` lines in this chunk lost their <...> parameter lists in transfer;
+// restore from the source tree before applying.
+template
+void conv_2d_cf(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    if (CONFIG_T::strategy == nnet::latency) {
+        conv_2d_latency_cf(data, res, weights, biases);
+    } else {
+        conv_2d_resource_cf(data, res, weights, biases);
+    }
+}
+
+// conv_2d_cl: strategy dispatcher (channels-last).
+template
+void conv_2d_cl(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    if (CONFIG_T::strategy == nnet::latency) {
+        conv_2d_latency_cl(data, res, weights, biases);
+    } else {
+        conv_2d_resource_cl(data, res, weights, biases);
+    }
+}
+
+// pointwise_conv_2d_cl: 1x1-kernel dispatcher (channels-last); guarded by filt_width == 1.
+template
+void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+                          typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                          typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        pointwise_conv_2d_latency_cl(data, res, weights, biases);
+    } else {
+        pointwise_conv_2d_resource_cl(data, res, weights, biases);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h
new file mode 100644
index 0000000000..29dd8ca633
--- /dev/null
+++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_latency.h
@@ -0,0 +1,392 @@
+#ifndef NNET_CONV2D_LATENCY_H_
+#define NNET_CONV2D_LATENCY_H_
+
+#include "nnet_common.h"
+#include
+
+namespace nnet {
+
+// Computes multiplier limit
+// This function should not be synthesized into firmware
+template
+int compute_multiplier_limit_conv2d(typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width *
+                                                                        CONFIG_T::n_chan * CONFIG_T::n_filt]) {
+    // Counts every multiplication whose input pixel is not padding and whose weight is
+    // non-negligible, then divides by reuse_factor (rounded up).
+    int n_mult = 0;
+
+    for (int oh = 0; oh < CONFIG_T::out_height; oh++) {
+        for (int ow = 0; ow < CONFIG_T::out_width; ow++) {
+            for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
+                for (int cc = 0; cc < CONFIG_T::n_chan; cc++) {
+                    for (int fh = 0; fh < CONFIG_T::filt_height; fh++) {
+                        for (int fw = 0; fw < CONFIG_T::filt_width; fw++) {
+
+                            int index_weight = fh * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt +
+                                               fw * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff;
+
+                            if ((oh * CONFIG_T::stride_height + fh) < CONFIG_T::pad_top ||
+                                (oh * CONFIG_T::stride_height + fh) >= (CONFIG_T::pad_top + CONFIG_T::in_height) ||
+                                (ow * CONFIG_T::stride_width + fw) < CONFIG_T::pad_left ||
+                                (ow * CONFIG_T::stride_width + fw) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) {
+                                // padded - do nothing
+                                continue;
+                            } else {
+                                // NOTE(review): with fixed-point weight types this effectively counts any
+                                // non-zero weight — confirm the 1e-20 threshold is intentional.
+                                if (weights[index_weight] > 1e-20 || weights[index_weight] < -1e-20) {
+                                    n_mult++;
+                                }
+                            }
+
+                        } // end mult loop
+                    } // end channel loop
+                } // end filter width loop
+            } // end filter height loop
+        } // end output width loop
+    } // end output height loop
+
+    // return ceil(float(n_mult) / float(CONFIG_T::reuse_factor));
+    return (n_mult + CONFIG_T::reuse_factor - 1) / CONFIG_T::reuse_factor;
+
+} // end compute_n_mult
+
+template
+void conv_2d_latency_cf(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T
res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * + CONFIG_T::filt_height * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit_conv2d(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + ConvOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + ConvFiltHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + ConvFiltWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + + int index_weight = fh * 
CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt + + fw * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + + if ((oh * CONFIG_T::stride_height + fh) < CONFIG_T::pad_top || + (oh * CONFIG_T::stride_height + fh) >= (CONFIG_T::pad_top + CONFIG_T::in_height) || + (ow * CONFIG_T::stride_width + fw) < CONFIG_T::pad_left || + (ow * CONFIG_T::stride_width + fw) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + int index_data = + cc * CONFIG_T::in_height * CONFIG_T::in_width + + (oh * CONFIG_T::stride_height + fh - CONFIG_T::pad_top) * CONFIG_T::in_width + + (ow * CONFIG_T::stride_width + fw - CONFIG_T::pad_left); + mult[index_mult] = data[index_data] * weights[index_weight]; + } + + } // end mult loop + } // end channel loop + } // end filter width loop + } // end filter height loop + } // end output width loop + } // end output height loop + + // Initialize accumulator with input biases + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff] = biases[ff]; + } + } + } + +// Accumulate multiplication result +AccumOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + AccumOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + AccumDotHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + AccumDotWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * 
CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + int index_acc = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + + acc[index_acc] += mult[index_mult]; + + } // end dot product filter width loop + } // end dot product filter height loop + } // end n channel loop + } // end n filter loop + } // end output width loop + } // end output height loop + + // Cast to "res_t" type + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + int res_index = ff * CONFIG_T::out_height * CONFIG_T::out_width + oh * CONFIG_T::out_width + ow; + int acc_index = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + res[res_index] = acc[acc_index]; + } + } + } + +} // end conv2d + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * + CONFIG_T::filt_height * CONFIG_T::filt_width]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int 
multiplier_limit = compute_multiplier_limit_conv2d(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + ConvOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + ConvFiltHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + ConvFiltWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + + int index_weight = fh * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt + + fw * CONFIG_T::n_chan * CONFIG_T::n_filt + cc * CONFIG_T::n_filt + ff; + + if ((oh * CONFIG_T::stride_height + fh) < CONFIG_T::pad_top || + (oh * CONFIG_T::stride_height + fh) >= (CONFIG_T::pad_top + CONFIG_T::in_height) || + (ow * CONFIG_T::stride_width + fw) < CONFIG_T::pad_left || + (ow * CONFIG_T::stride_width + fw) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + int index_data = (oh * CONFIG_T::stride_height + fh - CONFIG_T::pad_top) * + CONFIG_T::in_width * CONFIG_T::n_chan + + (ow * CONFIG_T::stride_width + fw - CONFIG_T::pad_left) * CONFIG_T::n_chan + + cc; + mult[index_mult] = data[index_data] * weights[index_weight]; + } + + } // end mult loop + } // end channel loop + } // end filter width loop + } // end filter height loop + } // end output width loop + } // end output height loop + + // Initialize accumulator with input biases + for (int oh = 0; 
oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff] = biases[ff]; + } + } + } + +// Accumulate multiplication result +AccumOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + AccumOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + AccumDotHeight: + for (int fh = 0; fh < CONFIG_T::filt_height; fh++) { + AccumDotWidth: + for (int fw = 0; fw < CONFIG_T::filt_width; fw++) { + + int index_mult = + oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * + CONFIG_T::filt_width + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + ff * CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width + + cc * CONFIG_T::filt_height * CONFIG_T::filt_width + fh * CONFIG_T::filt_width + fw; + int index_acc = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + + acc[index_acc] += mult[index_mult]; + + } // end dot product filter width loop + } // end dot product filter height loop + } // end n channel loop + } // end n filter loop + } // end output width loop + } // end output height loop + + // Cast to "res_t" type + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + int index = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + res[index] = (res_T)(acc[index]); + } + } + } + +} // end conv2d + +template +void pointwise_conv_2d_latency_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * 
CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + typename CONFIG_T::accum_t mult[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS PIPELINE + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + const int multiplier_limit = compute_multiplier_limit_conv2d(weights); +//#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + +// Convolve, saving all multiplication results to accumulate later +ConvOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + ConvOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + + int index_mult = oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + + int index_weight = cc * CONFIG_T::n_filt + ff; + + if ((oh * CONFIG_T::stride_height) < CONFIG_T::pad_top || + (oh * CONFIG_T::stride_height) >= (CONFIG_T::pad_top + CONFIG_T::in_height) || + (ow * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ow * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + int index_data = + (oh * CONFIG_T::stride_height - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_chan + + (ow * CONFIG_T::stride_width - CONFIG_T::pad_left) * 
CONFIG_T::n_chan + cc; + mult[index_mult] = data[index_data] * weights[index_weight]; + } + } + } + } + } + + // Initialize accumulator with input biases + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff] = biases[ff]; + } + } + } + +// Accumulate multiplication result +AccumOutHeight: + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + AccumOutWidth: + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + + int index_mult = oh * CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan + + ow * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_acc = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + + acc[index_acc] += mult[index_mult]; + } + } + } + } + + // Cast to "res_t" type + for (int oh = 0; oh < CONFIG_T::out_height; oh++) { + for (int ow = 0; ow < CONFIG_T::out_width; ow++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + int index = oh * CONFIG_T::out_width * CONFIG_T::n_filt + ow * CONFIG_T::n_filt + ff; + res[index] = (res_T)(acc[index]); + } + } + } + +} // end conv2d + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h new file mode 100644 index 0000000000..c5e386b5e9 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_resource.h @@ -0,0 +1,275 @@ +#ifndef NNET_CONV2D_RESOURCE_H_ +#define NNET_CONV2D_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void im2col_2d(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T 
data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height *
+                        CONFIG_T::out_width]) {
+    // Output spatial extent derived from padding/dilation/stride (standard conv output formula).
+    const int output_h = (CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom -
+                          (CONFIG_T::dilation_height * (CONFIG_T::filt_height - 1) + 1)) /
+                             CONFIG_T::stride_height +
+                         1;
+    const int output_w = (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right -
+                          (CONFIG_T::dilation_width * (CONFIG_T::filt_width - 1) + 1)) /
+                             CONFIG_T::stride_width +
+                         1;
+    const int channel_size = CONFIG_T::in_height * CONFIG_T::in_width;
+
+    for (int channel = CONFIG_T::n_chan; channel--; data += channel_size) {
+        for (int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) {
+            for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
+                int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height;
+                for (int output_rows = output_h; output_rows; output_rows--) {
+                    // FIX: was `input_row > CONFIG_T::in_height`, which let input_row == in_height
+                    // read one row past this channel's block (im2col_2d_cl uses >=).
+                    if (input_row < 0 || input_row >= CONFIG_T::in_height) {
+                        for (int output_cols = output_w; output_cols; output_cols--) {
+                            *(data_col++) = 0;
+                        }
+                    } else {
+                        int input_col = -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width;
+                        for (int output_col = output_w; output_col; output_col--) {
+                            if (input_col >= 0 && input_col < CONFIG_T::in_width) {
+                                *(data_col++) = data[input_row * CONFIG_T::in_width + input_col];
+                            } else {
+                                *(data_col++) = 0;
+                            }
+                            input_col += CONFIG_T::stride_width;
+                        }
+                    }
+                    input_row += CONFIG_T::stride_height;
+                }
+            }
+        }
+    }
+}
+
+// conv_2d_full: whole-image im2col into data_conv, then one dense call per output pixel.
+template <class data_T, class res_T, typename CONFIG_T> // NOTE(review): <...> reconstructed; lost in patch transfer — verify
+void conv_2d_full(
+    data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],
+    res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt],
+    typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    data_T data_conv[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::out_height *
+                     CONFIG_T::out_width];
+    data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan];
+    res_T res_col[CONFIG_T::n_filt];
+
+    ////#pragma HLS ARRAY_PARTITION variable=data_conv complete
+    //#pragma HLS ARRAY_PARTITION variable=data_col complete
+    //#pragma HLS ARRAY_PARTITION variable=res_col complete
+
+    im2col_2d(data, data_conv);
+
+    for (int i = 0; i < CONFIG_T::out_height * CONFIG_T::out_width; i++) {
+        for (int j = 0; j < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; j++) {
+            // FIX: was `data[...]` — must gather from the im2col buffer, as conv_1d_full does;
+            // indexing `data` here reads wrong elements and can run past its bounds.
+            data_col[j] = data_conv[j * CONFIG_T::out_height * CONFIG_T::out_width + i];
+        }
+        dense(data_col, res_col, weights, biases); // NOTE(review): dense<...> template args also lost in transfer — restore
+        for (int j = 0; j < CONFIG_T::n_filt; j++) {
+            // res[i * CONFIG_T::n_filt + j] = res_col[j];
+            res[j * CONFIG_T::out_height * CONFIG_T::out_width + i] = res_col[j]; // Transposed order
+        }
+    }
+}
+
+// im2col_2d_cf: gather one output pixel's receptive field, channels-first layout.
+template <class data_T, typename CONFIG_T> // NOTE(review): <...> reconstructed; lost in patch transfer — verify
+void im2col_2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width],
+                  data_T data_col[CONFIG_T::n_chan * CONFIG_T::filt_height * CONFIG_T::filt_width], const int row,
+                  const int col) {
+    const int channel_size = CONFIG_T::in_height * CONFIG_T::in_width;
+    int index = 0;
+    for (int channel = CONFIG_T::n_chan; channel--; data += channel_size) {
+        for (int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) {
+            int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height;
+            for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) {
+                // FIX: was `input_row > CONFIG_T::in_height` — same off-by-one as im2col_2d above.
+                if (input_row < 0 || input_row >= CONFIG_T::in_height) {
+                    data_col[index++] = 0;
+                } else {
+                    int input_col =
+                        -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width;
+                    if (input_col >= 0 && input_col < CONFIG_T::in_width) {
+                        //*(data_col++) = data[input_row * CONFIG_T::in_width + input_col];
+                        data_col[index++] = data[input_row * CONFIG_T::in_width + input_col];
+                    } else {
+                        //*(data_col++) = 0;
+                        data_col[index++] = 0;
+                    }
+                    input_col +=
CONFIG_T::stride_width; + } + } + input_row += CONFIG_T::stride_height; + } + } +} + +template +void conv_2d_resource_cf( + data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; + const int nout = CONFIG_T::n_filt; + const int rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +HeightLoop: + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + for (int j = 0; j < CONFIG_T::out_width; j++) { + //#pragma HLS PIPELINE + im2col_2d_cf(data, data_col, i, j); + dense(data_col, res_col, weights, biases); + FiltLoop: + for (int k = 0; k < CONFIG_T::n_filt; k++) { + // res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + res[k * CONFIG_T::out_height * CONFIG_T::out_width + i * CONFIG_T::out_width + j] = + res_col[k]; // Transposed order + } + } + } +} + +template +void im2col_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], const int row, + const int col) { + int index = 0; + for 
(int kernel_row = 0; kernel_row < CONFIG_T::filt_height; kernel_row++) { + int input_row = -CONFIG_T::pad_top + kernel_row * CONFIG_T::dilation_height + row * CONFIG_T::stride_height; + for (int kernel_col = 0; kernel_col < CONFIG_T::filt_width; kernel_col++) { + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row < 0 || input_row >= CONFIG_T::in_height) { + data_col[index++] = 0; + } else { + int input_col = + -CONFIG_T::pad_left + kernel_col * CONFIG_T::dilation_width + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } + } + } +} + +template +void im2col_2d_pointwise_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T data_col[CONFIG_T::n_chan], const int row, const int col) { + int index = 0; + int input_row = -CONFIG_T::pad_top + row * CONFIG_T::stride_height; + +ChannelLoop: + for (int channel = 0; channel < CONFIG_T::n_chan; channel++) { + if (input_row < 0 || input_row >= CONFIG_T::in_height) { + data_col[index++] = 0; + } else { + int input_col = -CONFIG_T::pad_left + col * CONFIG_T::stride_width; + if (input_col >= 0 && input_col < CONFIG_T::in_width) { + data_col[index++] = + data[input_row * CONFIG_T::in_width * CONFIG_T::n_chan + input_col * CONFIG_T::n_chan + channel]; + } else { + data_col[index++] = 0; + } + } + } +} + +template +void conv_2d_resource_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + const int nin = CONFIG_T::n_chan * CONFIG_T::filt_width; + const int nout = CONFIG_T::n_filt; + const int 
rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T data_col[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +HeightLoop: + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + for (int j = 0; j < CONFIG_T::out_width; j++) { + //#pragma HLS PIPELINE + im2col_2d_cl(data, data_col, i, j); + dense(data_col, res_col, weights, biases); + FiltLoop: + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +template +void pointwise_conv_2d_resource_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + const int nin = CONFIG_T::n_chan; + const int nout = CONFIG_T::n_filt; + const int rufactor = CONFIG_T::reuse_factor; + const int block_factor = DIV_ROUNDUP(nin * nout, rufactor); + + ////#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose + /// correctly + ////#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + ////#pragma HLS ARRAY_PARTITION variable=biases complete + + data_T 
data_col[CONFIG_T::n_chan]; + res_T res_col[CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION variable=data_col complete + //#pragma HLS ARRAY_PARTITION variable=res_col complete + +HeightLoop: + for (int i = 0; i < CONFIG_T::out_height; i++) { + WidthLoop: + for (int j = 0; j < CONFIG_T::out_width; j++) { + //#pragma HLS PIPELINE + im2col_2d_pointwise_cl(data, data_col, i, j); + dense(data_col, res_col, weights, biases); + FiltLoop: + for (int k = 0; k < CONFIG_T::n_filt; k++) { + res[i * CONFIG_T::out_width * CONFIG_T::n_filt + j * CONFIG_T::n_filt + k] = res_col[k]; + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h new file mode 100644 index 0000000000..7e76be12a9 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv2d_stream.h @@ -0,0 +1,117 @@ +#ifndef NNET_CONV2D_STREAM_H_ +#define NNET_CONV2D_STREAM_H_ + +#include "ac_channel.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_2d(const unsigned h_idx, const unsigned w_idx, + ac_int *pixel_idx) { + const unsigned sh_idx = CONFIG_T::template scale_index_height::scale_index(h_idx); + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + // #pragma HLS UNROLL + + unsigned sw_idx = CONFIG_T::template scale_index_width::scale_index(wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sh_idx * CONFIG_T::min_width + sw_idx]; + } +} + +template +void conv_2d_encoded_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); 
+ assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + ac_channel data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + //#pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } + } +} + +// Line Buffer +template +void conv_2d_buffer_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor 
* (CONFIG_T::strategy == nnet::latency); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (CONFIG_T::filt_height > 1) { + compute_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void conv_2d_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h new file mode 100644 index 0000000000..4d92cbf69f --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_conv_stream.h @@ -0,0 +1,398 @@ +#ifndef NNET_CONV_STREAM_H_ +#define NNET_CONV_STREAM_H_ + +#include "ac_channel.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +enum class conv_implementation { linebuffer = 0, encoded = 1 }; + +// ************************************************* +// Encoded Implementation (Vlad's) +// ************************************************* +template unsigned scale_index_K_gte_S(const unsigned idx) { + //#pragma HLS INLINE + + if (idx < K - S) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + 
constexpr unsigned sW = (DIV_ROUNDUP(K, S) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= K - S) { + return sW - r; + } + + return K - S + (idx - (K - S)) % S; +} + +template unsigned scale_index_K_lt_S(const unsigned idx) { + //#pragma HLS INLINE + + if (idx < S - K) { + return idx; + } + + constexpr unsigned nW = ((W - K) / S) * S + K; // Nearest W without unused pixels on the right + constexpr unsigned sW = (DIV_ROUNDUP(S, K) - 1) * S + K; // Scaled W that behaves like original W + if (idx >= nW) { + return sW; + } + + const unsigned r = nW - idx; + if (r <= S - K) { + return sW - r; + } + + return S - K + (idx - (S - K)) % S; +} + +template class scale_index_regular { + public: + static unsigned scale_index(const unsigned idx) { + // #pragma HLS INLINE + + if (K >= S) { + return scale_index_K_gte_S(idx); + } else { + return scale_index_K_lt_S(idx); + } + } +}; + +template class scale_index_unscaled { + public: + static unsigned scale_index(const unsigned idx) { + // #pragma HLS INLINE + return idx; + } +}; + +template +void mult_buffer(ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, ac_channel &res_stream, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=data complete + typename res_T::value_type res[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res complete + +InitData: + for (unsigned int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) { + // #pragma HLS UNROLL + data[id] = data_window[id].read(); + } + + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + data, res, weights, biases); + } else { + 
dense_resource( + data, res, weights, biases); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { + // #pragma HLS UNROLL + if (res_T::size / CONFIG_T::n_filt == 1) { + res_pack[jj] = res[jj]; + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + jj] = res[jj]; + } + } + + if (res_T::size / CONFIG_T::n_filt == 1) { + res_stream.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res_stream.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } +} + +template +void compute_output_encoded(const data_T &in_elem, + ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt], + ac_int *pixel_idx) { + //#pragma HLS INLINE + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MultLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + CopyDataFilt: + for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) { + // #pragma HLS UNROLL + CopyDataChan: + for (unsigned c = 0; c < CONFIG_T::n_chan; c++) { + // #pragma HLS UNROLL + if (pixel_idx[p][f]) + data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]); + } + } + if (pixel_idx[p][CONFIG_T::kernel_size - 1]) { + mult_buffer(data_window, res_pack, res, outputs_ready, weights, biases); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void kernel_shift_1d(const data_T &in_elem, + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]) { + //#pragma HLS inline + //#pragma HLS PIPELINE II = 1 + + // Shift kernel_window by one step to the left (manual shift 
operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + // #pragma HLS PIPELINE II = 1 + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // #pragma HLS UNROLL + // Shift every element in kernel_window to the left + kernel_window[i_iw * CONFIG_T::n_chan + i_ic] = kernel_window[(i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushChannel: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // #pragma HLS UNROLL + kernel_window[lastheight + i_ic] = in_elem[i_ic]; + } +} + +template +void kernel_shift_2d( + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::filt_height * CONFIG_T::n_chan]) { + //#pragma HLS inline + + // Shift kernel_window by one step to the left (manual shift operation) + static const int filt_width = CONFIG_T::filt_width - 1; +KernelShiftWidth: + for (int i_iw = 0; i_iw < filt_width; i_iw++) { + //#pragma HLS PIPELINE II = 1 + KernelShiftHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + KernelShiftChannel: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift every element in kernel_window to the left + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_iw * CONFIG_T::n_chan + i_ic] = + kernel_window[i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + (i_iw + 1) * CONFIG_T::n_chan + i_ic]; + } + } + } + + // Insert shift_buffer column into right-most column of kernel + static const int lastheight = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan; +KernelPushHeight: + for (unsigned int i_ih = 0; i_ih < CONFIG_T::filt_height; i_ih++) { + KernelPushChannel: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; 
i_ic++) { + kernel_window[lastheight + i_ih * CONFIG_T::filt_width * CONFIG_T::n_chan + i_ic] = shift_buffer[i_ih][i_ic]; + } + } +} + +template +void shift_line_buffer( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + typename data_T::value_type kernel_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]) { + + //#pragma HLS PIPELINE + + // Temporary buffer for popped (shifted) elements + typename data_T::value_type shift_buffer[CONFIG_T::filt_height][CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable = shift_buffer complete dim = 0 + +UpdateBuffer: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // #pragma HLS UNROLL + + // Insert pixel(s) at end of shift buffer + shift_buffer[CONFIG_T::filt_height - 1][i_ic] = in_elem[i_ic]; + } + +LineBufferDataIn: + for (unsigned int i_ic = 0; i_ic < CONFIG_T::n_chan; i_ic++) { + // Shift the shift buffer into the line buffer + LineBufferShift: + for (unsigned i_ih = 1; i_ih < CONFIG_T::filt_height; i_ih++) { + // #pragma HLS UNROLL + typename data_T::value_type pop_elem = line_buffer[i_ih - 1][i_ic].shift( + shift_buffer[CONFIG_T::filt_height - i_ih][i_ic]); // Shift the line buffer, return the popped pixel + shift_buffer[CONFIG_T::filt_height - i_ih - 1][i_ic] = + pop_elem; // Popped element placed back into shift_buffer, one row up. 
+ } + } + kernel_shift_2d(shift_buffer, kernel_window); +} + +template +void compute_output_buffer_2d( + const data_T &in_elem, + ap_shift_reg line_buffer[MAX(CONFIG_T::filt_height - 1, 1)] + [CONFIG_T::n_chan], + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // Counters + static int pX = 0; // Pixel X + static int pY = 0; // Pixel Y + + static int sX = 0; // Stride X + static int sY = 0; // Stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + + // Dense multiply + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + // #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + 
// Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +// Conv 1D compute output +template +void compute_output_buffer_1d( + const data_T &in_elem, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + + // Counters + static int pX = 0; // pixel counter + static int sX = 0; // stride counter + + static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel to buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + + // Dense multiply + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + kernel_data, res_out, weights, biases); + } else { + dense_resource( + kernel_data, res_out, weights, biases); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + // #pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? 
subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense.h new file mode 100644 index 0000000000..64b927cc64 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense.h @@ -0,0 +1,49 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
+ // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + //#pragma HLS inline + if (CONFIG_T::strategy == nnet::latency) { + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 0000000000..f3f27b6db8 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,106 @@ +// +// hls4ml: Vivado HLS code for neural-net building blocks +// +// Copyright (C) 2018 Giuseppe Di Guglielmo +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+// + +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + // #pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + // if (CONFIG_T::store_weights_in_bram){ + ////#pragma HLS RESOURCE variable=weights core=ROM_1P_BRAM + //#pragma HLS data_pack variable=weights struct_level + //} + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + // #pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + // #pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += 
weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + // #pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h new file mode 100644 index 0000000000..40e5cd2b9d --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_latency.h @@ -0,0 +1,92 @@ + +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + // Partial unroll config + constexpr int prod1_unroll = + (ce_reuse_factor < CONFIG_T::n_in) ? 
CONFIG_T::n_in : (int)(CONFIG_T::n_in * CONFIG_T::n_out) / ce_reuse_factor; + constexpr int prod2_unroll = (int)CONFIG_T::n_out / ce_reuse_factor; + + (void)ce_reuse_factor; // to silence compiler warnings + (void)prod1_unroll; + (void)prod2_unroll; + + // For Catapult, add an extra scope so that we can apply the pipeline pragma as if it applied to the function + do { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression + // sometimes + //#pragma HLS ARRAY_PARTITION variable=biases complete + //#pragma HLS ARRAY_PARTITION variable=mult complete + //#pragma HLS ARRAY_PARTITION variable=acc complete + + // int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - + // floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor)); + constexpr int multiplier_limit = + ((CONFIG_T::n_in * CONFIG_T::n_out) / CONFIG_T::reuse_factor) - CONFIG_T::n_zeros / CONFIG_T::reuse_factor; + CONFIG_T::template product::limit(multiplier_limit); + + // Do the matrix-multiply + Product1: + for (unsigned int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (unsigned int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = + CONFIG_T::template product::product(cache, weights[index]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (unsigned int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = 
(typename CONFIG_T::accum_t)biases[iacc]; + } + + // Accumulate multiplication result + Accum1: + for (unsigned int ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (unsigned int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + + // Cast to "res_t" type + Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } + } while (false); // one iteration loop +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h new file mode 100644 index 0000000000..5bcd1a54b7 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,262 @@ + +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights 
core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + //#pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is 
not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + //#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + //#pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + unsigned int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + for (unsigned int ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + for (unsigned int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
+ } + } + +// Cast to "res_t" type +Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + //#pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + //#pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + unsigned int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + 
CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + //#pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + //#pragma HLS UNROLL + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + //#pragma HLS UNROLL + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + for (unsigned int ires = 0; ires < CONFIG_T::n_out; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + //#pragma HLS INLINE region + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h new file mode 100644 index 0000000000..665d2f43f3 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_types.h" 
+#include +#include + +namespace nnet { + +template +void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + //#pragma HLS INLINE region + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } +} + +template +void dense(ac_channel &data_stream, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + typename data_T::value_type data[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=data complete + + typename res_T::value_type res[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=res complete + + if ((CONFIG_T::n_in / data_T::size) > 1) { + } +DataPrepare: + for (unsigned int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_in / data_T::size > 1) { + //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (unsigned int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + + dense_wrapper(data, res, weights, biases); + + if ((CONFIG_T::n_out / res_T::size) > 1) { + } +ResWrite: + for (unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + if (CONFIG_T::n_out / res_T::size > 1) { + //#pragma HLS PIPELINE + } + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack: + for (unsigned int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_embed.h b/hls4ml/templates/catapult/nnet_utils/nnet_embed.h new 
file mode 100644 index 0000000000..4cdf507f9d --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_embed.h @@ -0,0 +1,47 @@ +#ifndef NNET_EMBED_H_ +#define NNET_EMBED_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +struct embed_config { + // Internal data type definitions + typedef float embeddings_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 16; + static const unsigned vocab_size = 50; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; +}; + +template +void embedding(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { + + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // This can save a few cycles, but it will create a large multiplexer due to + // non-constant access pattern, so let's leave it out + ////#pragma HLS ARRAY_PARTITION variable=embeddings complete + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +InputSequence: + for (int j = 0; j < CONFIG_T::n_in; j++) { + // #pragma HLS UNROLL + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + // #pragma HLS UNROLL + res[j * CONFIG_T::n_out + i] = embeddings[data[j] * CONFIG_T::n_out + i]; + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h new file mode 100644 index 0000000000..1378100879 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_embed_stream.h @@ -0,0 +1,34 @@ +#ifndef NNET_EMBED_STREAM_H_ +#define NNET_EMBED_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_helpers.h" + +namespace nnet { + +template +void embedding(ac_channel &data, ac_channel &res, + typename CONFIG_T::embeddings_t embeddings[CONFIG_T::vocab_size * CONFIG_T::n_out]) { 
+ data_T in_data = data.read(); + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +InputSequence: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + DenseEmbedding: + for (int i = 0; i < CONFIG_T::n_out; i++) { + // #pragma HLS UNROLL + res_pack[i] = embeddings[in_data[j] * CONFIG_T::n_out + i]; + } + res.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h b/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h new file mode 100644 index 0000000000..7451110fba --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_garnet.h @@ -0,0 +1,816 @@ + +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "ac_channel.h" +#include "hls_math.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ac_int index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ac_fixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. 
/ table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { +#ifdef __SYNTHESIS__ + // hack for now to get through the flow + edge_weights_table[iw] = (-v * v); +#else + edge_weights_table[iw] = std::exp(-v * v); +#endif + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ac_int index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) / step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + //#pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // //#pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename 
std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + //#pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + //#pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + //#pragma HLS INLINE + //#pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + //#pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + //#pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + //#pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + //#pragma HLS INLINE + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = 
(T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + //#pragma HLS INLINE + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + //#pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + //#pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + //#pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + //#pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + 
OutputBiasNormalizer(nvtx_T const nvtx) { + //#pragma HLS ARRAY_PARTITION variable=output_biases complete + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + //#pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + //#pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + //#pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + //#pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter { + typedef res_T res_t; + + res_T *resref; + + OutputResSetter(res_T *r) : resref{r} { + //#pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + //#pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + //#pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + //#pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + //#pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename 
CONFIG_T::distance_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + //#pragma HLS INLINE + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + //#pragma HLS INLINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline 
void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + //#pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * 
CONFIG_T::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T 
data[prev_layer_t::n_out_features]; + //#pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); + + SingleVertexDataGetter data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + //#pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + //#pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * 
n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef 
typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * 
CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < 
CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_helpers.h b/hls4ml/templates/catapult/nnet_utils/nnet_helpers.h new file mode 100644 index 0000000000..ed701e5c59 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_helpers.h @@ -0,0 +1,461 @@ + +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "ac_channel.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern const char *get_weights_dir(); + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR 
get_weights_dir() +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + // CATAPULT_PORT + // std::istringstream(token) >> w[i]; + double tmp; + std::istringstream(token) >> tmp; + w[i] = tmp; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } 
+} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + double sign; + double weight; + if (!(structss >> sign >> weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + w[i].sign = sign; + w[i].weight = weight; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template +void convert_single_data(ac_fixed &src, double &dst) { + dst = src.to_double(); +} +template +void convert_single_data(ac_fixed &src, float &dst) { + dst = src.to_double(); +} +template void convert_single_data(srcType &src, dstType &dst) { dst = dstType(src); } +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + convert_single_data(src[i], dst[i]); + } +} + +template void convert_data(srcType *src, ac_channel &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(ac_channel &src, dstType *dst) { + for 
(size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j].to_double()); // this may only work for ac_fixed + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + +template void save_output_array(ac_channel &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +template void save_output_array(ac_channel &data, float *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = ctype[j].to_double(); + } + data.write(ctype); + } +} + +template void save_output_array(ac_channel &data, double *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = ctype[j].to_double(); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + 
std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(ac_channel &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << ctype[j].to_double(); + out << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, ac_channel &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (unsigned i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(ac_channel &result, std::ostream &out, bool keep = false) { + if (!keep) { + while (result.available(1)) { + res_T res_pack = result.read(); + for (unsigned int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + } + out << std::endl; + } else { + if (result.debug_size() >= 
SIZE / res_T::size) { + for (unsigned int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result[i]; // peek + for (unsigned int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + } + out << std::endl; + } + } +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(ac_channel &data) { + for (unsigned int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (unsigned int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +// Fix for CAT-36531 +template void fill_random(data_T data[SIZE]) { + // std::cout << "Fill_Random SIZE:"<< SIZE << std::endl; + data_T MAX_VALUE; + for (unsigned int i = 0; i < SIZE; i++) { + // Generate a random value (for example, between 0 and 1) + data_T random_value = (data_T)rand() / MAX_VALUE.template set_val(); + data[i] = random_value; + } +} + +template void fill_random(ac_channel &data) { + typedef typename data_T::value_type base_T; + base_T MAX_VALUE; + for (unsigned int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (unsigned int j = 0; j < data_T::size; j++) { + // Generate a random value (for example, between 0 and 1) + base_T random_value = (base_T)rand() / MAX_VALUE.template set_val(); + data_pack[j] = random_value; + } + data.write(data_pack); + } + // std::cout << "Fill_Random AC_CHANNEL" << std::endl; +} + +template int read_file_1D(const char *filename, dataType data[nrows]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii] = newval; + } else { + return -2; + } + } + fclose(fp); + return 0; +} + +template +int read_file_2D(const char *filename, dataType data[nrows][ncols]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; 
ii++) { + for (int jj = 0; jj < ncols; jj++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii][jj] = newval; + } else { + return -2; + } + } + } + fclose(fp); + return 0; +} + +template void change_type(ac_channel &in, ac_channel &out) { + in_T datareg; + ac_channel input_trunc; + for (int ii = 0; ii < N_IN; ii++) { + out << (out_T)in.read(); + } +} + +template void hls_stream_debug(ac_channel &data, ac_channel &res) { + data_T datareg; + for (int ii = 0; ii < N_IN; ii++) { + datareg = data.read(); + std::cout << "[" << ii << "]: " << datareg << std::endl; + res << datareg; + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_image.h b/hls4ml/templates/catapult/nnet_utils/nnet_image.h new file mode 100755 index 0000000000..26947fae01 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_image.h @@ -0,0 +1,41 @@ +#ifndef NNET_IMAGE_H_ +#define NNET_IMAGE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct resize_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned n_chan = 10; + static const unsigned new_height = 10; + static const unsigned new_width = 10; +}; + +template +void resize_nearest(data_T image[CONFIG_T::height * CONFIG_T::width * CONFIG_T::n_chan], + data_T resized[CONFIG_T::new_height * CONFIG_T::new_width * CONFIG_T::n_chan]) { + int y_ratio = (int)((CONFIG_T::height << 16) / CONFIG_T::new_height) + 1; + int x_ratio = (int)((CONFIG_T::width << 16) / CONFIG_T::new_width) + 1; + int x2, y2; + + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::new_height; i++) { + for (int j = 0; j < CONFIG_T::new_width; j++) { + x2 = ((j * x_ratio) >> 16); + y2 = ((i * y_ratio) >> 
16); + for (int k = 0; k < CONFIG_T::n_chan; k++) { + resized[(i * CONFIG_T::new_width * CONFIG_T::n_chan) + j * CONFIG_T::n_chan + k] = + image[(y2 * CONFIG_T::width * CONFIG_T::n_chan) + x2 * CONFIG_T::n_chan + k]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h new file mode 100644 index 0000000000..1757f7bfb8 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_image_stream.h @@ -0,0 +1,66 @@ +#ifndef NNET_IMAGE_STREAM_H_ +#define NNET_IMAGE_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" + +namespace nnet { + +template void resize_nearest(ac_channel &image, ac_channel &resized) { + assert(CONFIG_T::new_height % CONFIG_T::height == 0); + assert(CONFIG_T::new_width % CONFIG_T::width == 0); + constexpr unsigned ratio_height = CONFIG_T::new_height / CONFIG_T::height; + constexpr unsigned ratio_width = CONFIG_T::new_width / CONFIG_T::width; + +ImageHeight: + for (unsigned h = 0; h < CONFIG_T::height; h++) { + //#pragma HLS PIPELINE + + data_T data_in_row[CONFIG_T::width]; + + ImageWidth: + for (unsigned i = 0; i < CONFIG_T::width; i++) { + //#pragma HLS UNROLL + + data_T in_data = image.read(); + + ImageChan: + for (unsigned j = 0; j < CONFIG_T::n_chan; j++) { + //#pragma HLS UNROLL + + data_in_row[i][j] = in_data[j]; + } + } + + ResizeHeight: + for (unsigned i = 0; i < ratio_height; i++) { + //#pragma HLS UNROLL + + ImageWidth2: + for (unsigned l = 0; l < CONFIG_T::width; l++) { + //#pragma HLS UNROLL + + ResizeWidth: + for (unsigned j = 0; j < ratio_width; j++) { + //#pragma HLS UNROLL + + data_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ResizeChan: + for (unsigned k = 0; k < CONFIG_T::n_chan; k++) { + //#pragma HLS UNROLL + + out_data[k] = data_in_row[l][k]; + } + + resized.write(out_data); + } + } + } + } +} + +} // namespace nnet + +#endif diff --git 
a/hls4ml/templates/catapult/nnet_utils/nnet_math.h b/hls4ml/templates/catapult/nnet_utils/nnet_math.h new file mode 100644 index 0000000000..c25f7187b6 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_math.h @@ -0,0 +1,178 @@ +#ifndef NNET_MATH_H_ +#define NNET_MATH_H_ + +#include "hls_math.h" + +namespace nnet { + +// This header defines the functions that return type different from the input +// For example, hls::sin(x) returns ac_fixed +// By ensuring we return the same type we can avoid casting issues in expressions + +template T sin(T x) { return (T)hls::sin(x); }; + +template T cos(T x) { return (T)hls::cos(x); }; + +template T asin(T x) { return (T)hls::asin(x); }; + +template T acos(T x) { return (T)hls::acos(x); }; + +template T atan(T x) { return (T)hls::atan(x); }; + +template T atan2(T x, T y) { return (T)hls::atan2(x, y); }; + +template void init_sincos_table(T table[1 << (W - I - 3)][2]) { + unsigned int NTE = 1 << (W - I - 3); // No of table entries + double step = M_PI / (4 * NTE); // Interval between angles + double y = 0; + // double scaled_angle = 0; + + for (unsigned int i = 0; i < NTE; i++) { + table[i][0] = std::cos(y); + table[i][1] = std::sin(y); + y += step; + // scaled_angle = y/(2*M_PI); + // printf("cos(%f) = %23.22f, sin(%f) = %23.22f index = %d, scaled angle = %13.12f \n", y, cos(y), y, sin(y), i, + // scaled_angle); + } +} + +template void sincos_lut(const T &input, T output[2]) { + + #pragma HLS INLINE + + // This implementation is based on ac_sincos_lut.h from AC math library + + static bool flag = true; + if (flag && T::width - T::iwidth > 12) { +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl; + std::cout << "Warning: The output of sincos_lut will not be accurate" << std::endl; +#endif + flag = false; + } + // Datatype for lookup table entries + typedef ac_fixed luttype; + // Datatype for posinput which is used to handle negative 
inputs + typedef ac_fixed posinputtype; + + typedef ac_int<9, false> lutindextype; // 9 bits required for indexing into 512 entry table + typedef ac_int<3, false> octanttype; // 3 bits required for octant value range of 0 thru 7 + T outputtemp[2]; + lutindextype luTdex = 0; + posinputtype posinput = input; + + // Initialize the lookup table +#ifdef __SYNTHESIS__ + bool initialized = false; + luttype sincos[512][2]; +#else + static bool initialized = false; + static luttype sincos[512][2]; +#endif + if (!initialized) { + init_sincos_table(sincos); + initialized = true; + } + + // Leaving this commented out makes the table to to BRAM + //#pragma HLS ARRAY_PARTITION variable=sincos complete dim=0 + + typedef ac_int lutindextype1; + // Extracting (MSB-3:LSB) bits of scaled input to determine the lookup table index + lutindextype1 luTdex1 = posinput.range(AP_MAX(T::width - T::iwidth - 3, 1), 0); // Extracting the lookup table index + + if (T::width - T::iwidth >= 4 && T::width - T::iwidth <= 12) { + luTdex(8, 12 - (T::width - T::iwidth)) = luTdex1; // stride + } + // Approximation for the scaled inputs whose number of bits are greater than 12 + else if (T::width - T::iwidth > 12) { + // Lookup table index for the scaled inputs whose number of bits are greater than 12 + luTdex = luTdex1 / (1 << (AP_MAX(T::width - T::iwidth - 12, 0))); + if ((luTdex1 % (1 << (AP_MAX(T::width - T::iwidth - 12, 0)))) > (1 << (AP_MAX(T::width - T::iwidth - 13, 0)))) { + luTdex = luTdex + 1; + } + typedef ac_fixed + datatype; + datatype x = (datatype)luTdex1; + x = x >> AP_MAX(T::width - T::iwidth - 12, 0); + if (x > 511.5) { + luTdex = 511; + } + if (luTdex1 <= 1 << (AP_MAX(T::width - T::iwidth - 13, 0)) && luTdex1 != 0) { + luTdex = 1; + } + } + + if (T::width - T::iwidth >= 3) { + // Getting the octant 0-7 by extracting the first 3 bits from MSB side of scaled input where + // octant 0 corresponds to [0-PI/4), + // octant 1 corresponds to [PI/4-2PI/4), + // octant 2 corresponds to 
[2PI/4-3PI/4) and so on + // octanttype octant = posinput.template slc<3>(T::width-T::iwidth-3); + octanttype octant = posinput(T::width - T::iwidth - 1, T::width - T::iwidth - 3); + luTdex = (octant[0] == 1) ? (lutindextype)(512 - luTdex) : (lutindextype)(luTdex); + // imaginary part is sine + outputtemp[1] = ((octant == 0) | (octant == 3)) ? (T)sincos[luTdex][1] + : ((octant == 2) | (octant == 1)) ? (T)sincos[luTdex][0] + : ((octant == 7) | (octant == 4)) ? (T)-sincos[luTdex][1] + : (T)-sincos[luTdex][0]; + // real part is cosine + outputtemp[0] = ((octant == 6) | (octant == 1)) ? (T)sincos[luTdex][1] + : ((octant == 3) | (octant == 4)) ? (T)-sincos[luTdex][0] + : ((octant == 2) | (octant == 5)) ? (T)-sincos[luTdex][1] + : (T)sincos[luTdex][0]; + // Below two are the cases when the output corresponds to + or - (0 or 1) for which there is no entry in the lookup + // table + output[1] = ((posinput == 0.125) | (posinput == 0.375)) ? T(0.7071067811865475244008) + : ((posinput == 0.625) | (posinput == 0.875)) ? T(-0.7071067811865475244008) + : outputtemp[1]; + output[0] = ((posinput == 0.125) | (posinput == 0.875)) ? T(0.7071067811865475244008) + : ((posinput == 0.375) | (posinput == 0.625)) ? T(-0.7071067811865475244008) + : outputtemp[0]; + } + + if (T::width - T::iwidth <= 2) { + output[1] = (posinput == 0) ? (T)0 + : (posinput == 0.25) ? (T)1 + : (posinput == 0.5) ? (T)0 + : (posinput == 0.75) ? (T)-1 + : outputtemp[1]; + output[0] = (posinput == 0) ? (T)1 + : (posinput == 0.25) ? (T)0 + : (posinput == 0.5) ? (T)-1 + : (posinput == 0.75) ? 
(T)0 + : outputtemp[0]; + } + +#if !defined(__SYNTHESIS__) && defined(SINCOS_LUT_DEBUG) + std::cout << "FILE : " << __FILE__ << ", LINE : " << __LINE__ << std::endl; + std::cout << "============AP_FIXED SINCOS======================" << std::endl; + std::cout << "positive input is = " << posinput << std::endl; + std::cout << "lut index is = " << luTdex << std::endl; + std::cout << "sin value is = " << output[1] << std::endl; + std::cout << "cos value is = " << output[0] << std::endl; + std::cout << "=================================================" << std::endl; +#endif +} + +template T sin_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ac_fixed<16, 0, false>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[1]; +} + +template T cos_lut(const T input) { + #pragma HLS INLINE + T sincos_res[2]; + T scaled_input = input * ac_fixed<16, 0, false>(0.15915494309); // 1/(2*pi) + sincos_lut(scaled_input, sincos_res); + return sincos_res[0]; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h new file mode 100644 index 0000000000..00c2cf5e12 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h @@ -0,0 +1,232 @@ + +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + typedef float accum_t; + // Product function to use + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 
10; + static const unsigned n_elem2_2 = 10; + + static const int axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] + data2[ii]; + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] - data2[ii]; + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] * data2[ii]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? 
data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + CONFIG_T::template product::limit(multiplier_limit); + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + // #pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + // #pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * 
CONFIG_T::n_elem2_1]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int 
res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * 
CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h new file mode 100644 index 0000000000..ef0d542fc0 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge_stream.h @@ -0,0 +1,380 @@ + +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + AddPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template 
+void multiply(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = (in_data1[j] > 
in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(ac_channel &data1, ac_channel &data2, ac_channel &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + 
input1_T in_data1 = data1.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(ac_channel &data1, ac_channel &data2, ac_channel &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + // pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + 
// #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(ac_channel &data1, ac_channel &data2, ac_channel &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + // #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + // #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(ac_channel &data1, ac_channel &data2, ac_channel &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(ac_channel &data1, ac_channel &data2, ac_channel &res) { + res_T out_data; +//#pragma HLS DATA_PACK variable=out_data +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + //#pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data1[j]; + } + res.write(out_data); + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + //#pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < res_T::size; j++) { + // #pragma HLS UNROLL + out_data[j] = in_data2[j]; + } + 
res.write(out_data); + } +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_mult.h b/hls4ml/templates/catapult/nnet_utils/nnet_mult.h new file mode 100755 index 0000000000..7379eec489 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_mult.h @@ -0,0 +1,127 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. + * --- */ + +class Product { + public: + static void limit(unsigned multiplier_limit) {} // Nothing to do here +}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + //#pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + //#pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + //#pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + //#pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + //#pragma HLS INLINE + return a * w; + } + static void limit(unsigned multiplier_limit) { + //#pragma HLS INLINE + //#pragma HLS ALLOCATION instances=mul 
limit=multiplier_limit operation + } +}; + +template class weight_exponential : public Product { + public: + // Construct the return type from the multiplication equivalent to the largest shifts + // ap_int is the type if the multiplicand equivalent to the largest lshift << + // ap_fixed is the type of the multiplicand equivalent to the largest rshift >> + using r_T = decltype(x_T(0) * (ac_int(1) + + ac_fixed(1))); + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + //#pragma HLS INLINE + // shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + // negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ac_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_padding.h b/hls4ml/templates/catapult/nnet_utils/nnet_padding.h new file mode 100755 index 0000000000..47986523fb --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_padding.h @@ -0,0 +1,145 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int j = 
0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + 
+template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h new file mode 100644 index 0000000000..9c11683746 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_padding_stream.h @@ -0,0 +1,95 @@ +#ifndef NNET_PADDING_STREAM_H_ +#define NNET_PADDING_STREAM_H_ + +#include + +namespace nnet { + +template void fill_zero(ac_channel &res) { + //#pragma HLS INLINE + res_T res_part; + for (unsigned int c = 0; c < CONFIG_T::n_chan; c++) { + //#pragma HLS UNROLL + res_part[c] = 0; + } + res.write(res_part); +} + +template void fill_data(ac_channel &data, ac_channel &res) { + //#pragma HLS INLINE + data_T data_part = data.read(); + res_T res_part; + for (unsigned int c = 0; c < CONFIG_T::n_chan; c++) { + //#pragma HLS UNROLL + res_part[c] = data_part[c]; + } + res.write(res_part); +} + +template void zeropad1d_cl(ac_channel &data, ac_channel &res) { +PadLeft: + for (int i = 0; i < 
CONFIG_T::pad_left; i++) { + fill_zero(res); + } + +CopyMain: + for (int i = 0; i < CONFIG_T::in_width; i++) { + fill_data(data, res); + } + +PadRight: + for (int i = 0; i < CONFIG_T::pad_right; i++) { + fill_zero(res); + } +} + +// Description: +// apply zero padding to input feature data "data" based on +// padding parameters in CONFIG_T +// +// CONFIG_T::pad_top +// CONFIG_T::pad_left "data" CONFIG_T::pad_right +// CONFIG_T::pad_bottom +// +// Template Params: +// data_T - typically nnet::array< ac_fixed<>, 3*1> (see myproject.cpp -> firmware/defines.h) +// res_T - typically nnet::array< ac_fixed<>, 3*1> + +template void zeropad2d_cl(ac_channel &data, ac_channel &res) { + +PadTop: + for (unsigned i = 0; i < CONFIG_T::pad_top; i++) { + PadTopWidth: + for (unsigned j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } + +PadMain: + for (unsigned i = 0; i < CONFIG_T::in_height; i++) { + PadLeft: + for (unsigned j = 0; j < CONFIG_T::pad_left; j++) { + fill_zero(res); + } + CopyMain: + for (unsigned j = 0; j < CONFIG_T::in_width; j++) { + fill_data(data, res); + } + PadRight: + for (unsigned j = 0; j < CONFIG_T::pad_right; j++) { + fill_zero(res); + } + } + +PadBottom: + for (unsigned i = 0; i < CONFIG_T::pad_bottom; i++) { + PadBottomWidth: + for (unsigned j = 0; j < CONFIG_T::out_width; j++) { + fill_zero(res); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h new file mode 100644 index 0000000000..82e281023b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling.h @@ -0,0 +1,362 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Return the maximum value from an array +template T max(T x[N]) { + T y = x[0]; + for (int i = 1; i < N; i++) { + y = x[i] > y ? 
x[i] : y; + } + return y; +} + +template ac_int avg(ac_int (&x)[N]) { + // Use a wider accumulator than the input to avoid overflow + ac_int tmp = 0; + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ac_int y = tmp; + return tmp; +} + +template ac_fixed avg(ac_fixed (&x)[N]) { + // Use a wider accumulator than the input to avoid overflow + ac_fixed tmp = 0; + for (int i = 0; i < N; i++) { + tmp += x[i]; + } + tmp /= N; + // Now cast back to original type + ac_fixed y = tmp; + return y; +} + +// Return the mean value of an array +template T avg(T (&x)[N]) { + T y = 0; + for (int i = 0; i < N; i++) { + y += x[i]; + } + y /= N; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template T pool_op(T (&x)[N]) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x); + // case L2Norm: return l2norm(x); + } +} + +template T pad_val() { + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + break; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window x + for (int jj = 0; jj < CONFIG_T::stride_width; jj++) { + if (ii + jj < CONFIG_T::pad_left || ii + jj >= (padded_width - CONFIG_T::pad_right)) { + // Add 
padding + pool[jj] = pad_val(); + if (CONFIG_T::count_pad) { + img_overlap++; + } + } else { + pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale; + } + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config { + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - 
pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete 
dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) { + img_overlap++; + } + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt + + (jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale; + } + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any 
necessary padding + unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) { + padded_height -= padded_height - (padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height); + padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width); + } + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) { + img_overlap++; + } + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * 
CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION instances=pool_op limit=limit function + +FiltLoop: + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast(pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h new file mode 100644 index 0000000000..051a27a54b --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_pooling_stream.h @@ -0,0 +1,601 @@ +#ifndef NNET_POOLING_STREAM_H_ +#define NNET_POOLING_STREAM_H_ + +// #include "utils/x_hls_utils.h" +#include "ac_channel.h" +#include "ap_shift_reg.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include "nnet_pooling.h" + +namespace nnet { + +// 
************************************************* +// Max/average pooling +// ************************************************* + +template T reduce_pool(T x[N]) { + //#pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + return reduce>(x, op_max); + } else { + Op_add op_add; + T sum = reduce>(x, op_add); + return sum / N; + } +} + +template void init_pool_table(unsigned table[TABLE_SIZE]) { + for (unsigned ii = 0; ii < TABLE_SIZE; ii++) { + table[ii] = ii % POOL_SIZE; + } +} + +template +void compute_pool_encoded_2d( + const unsigned h_idx, const unsigned w_idx, const data_T &in_elem, + ac_channel data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest H without unused pixels on the right + constexpr unsigned nH = + ((CONFIG_T::in_height - CONFIG_T::pool_height) / CONFIG_T::stride_height) * CONFIG_T::stride_height + + CONFIG_T::pool_height; + // Scaled H that behaves like original H + constexpr unsigned sH = + (DIV_ROUNDUP(CONFIG_T::pool_height, CONFIG_T::stride_height) - 1) * CONFIG_T::stride_height + CONFIG_T::pool_height; + // Nearest W without unused pixels on the right + constexpr unsigned nW = ((CONFIG_T::in_width - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_height[CONFIG_T::in_height]; + unsigned pool_table_width[CONFIG_T::in_width]; +#else + static bool initialized = false; + static unsigned pool_table_height[CONFIG_T::in_height]; + static unsigned pool_table_width[CONFIG_T::in_width]; +#endif + if (!initialized) { + init_pool_table(pool_table_height); + init_pool_table(pool_table_width); + initialized = true; + } + + //#pragma HLS 
INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + //#pragma HLS ARRAY_PARTITION variable=pool_table_height complete + //#pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned sh_idx = pool_table_height[h_idx] * CONFIG_T::pool_width; + const unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_filt); +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + //#pragma HLS PIPELINE + + ac_int filt_mask = 0; + if ((h_idx < nH) && (wp_idx + p < nW)) { + filt_mask = sh_idx + pool_table_width[wp_idx + p] + 1; + } + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write( + in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_height * CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_height * CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_height * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = + reduce_pool( + pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool( + pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling2d_encoded_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_top == 0 && 
CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + static ac_channel + data_window[CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt]; + // constexpr int win_depth = CONFIG_T::pool_height * CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::pool_height * CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + constexpr int pack_factor = (data_T::size / CONFIG_T::n_filt) * (res_T::size / CONFIG_T::n_filt == 1); + (void)pack_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (pack_factor); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + //#pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_2d(i_ih, i_iw, data.read(), data_window, res, res_pack, + outputs_ready); + } + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) +// ************************************************* +template +void compute_pool_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::pool_height - 1, 1)][CONFIG_T::n_filt], + ac_channel &res) { + //#pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + const static int lShiftY = CONFIG_T::pool_height - 1; + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + static int sX = 0; // stride X + static int sY = 0; // stride Y + + typename data_T::value_type pool_window[CONFIG_T::pool_height * CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_height * 
CONFIG_T::pool_width * CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel into line buffer, return pooling kernels + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_ihw = 0; i_ihw < CONFIG_T::pool_height * CONFIG_T::pool_width; i_ihw++) { + pool_window[i_ihw] = kernel_data[i_ihw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = + reduce_pool( + pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { // Next line + pY = pY + 1; + // Update stride (threshold) ? subtract stride : increment stride + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? 
sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling2d_buffer_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + static ap_shift_reg line_buffer[MAX(CONFIG_T::pool_height - 1, 1)] + [CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + //#pragma HLS PIPELINE + + compute_pool_buffer_2d(data.read(), line_buffer, res); + } + } +} + +template void pooling2d_cl(ac_channel &data, ac_channel &res) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling2d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling2d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Pooling 1D +// ************************************************* + +template +void compute_pool_encoded_1d(const unsigned w_idx, const data_T &in_elem, + ac_channel data_window[CONFIG_T::pool_width * CONFIG_T::n_filt], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready) { + // Nearest W without unused pixels on the right + constexpr unsigned nW = + ((CONFIG_T::n_in - CONFIG_T::pool_width) / CONFIG_T::stride_width) * CONFIG_T::stride_width + CONFIG_T::pool_width; + // Scaled W that behaves like original W + constexpr unsigned sW = + (DIV_ROUNDUP(CONFIG_T::pool_width, CONFIG_T::stride_width) - 1) * CONFIG_T::stride_width + CONFIG_T::pool_width; + +#ifdef __SYNTHESIS__ + bool initialized = false; + unsigned pool_table_width[CONFIG_T::n_in]; +#else + static bool initialized = false; + static unsigned 
pool_table_width[CONFIG_T::n_in]; +#endif + if (!initialized) { + init_pool_table(pool_table_width); + initialized = true; + } + + //#pragma HLS INLINE + + if (data_T::size / CONFIG_T::n_filt > 1) { + //#pragma HLS ARRAY_PARTITION variable=pool_table_width complete + } + + typename CONFIG_T::accum_t pool_window[CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + const unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_filt); + +PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + //#pragma HLS PIPELINE + + ac_int filt_mask = 0; + if (wp_idx + p < nW) { + filt_mask = pool_table_width[wp_idx + p] + 1; + } + + CopyDataFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + if (filt_mask > 0) + data_window[c * CONFIG_T::pool_width + filt_mask.to_uint() - 1].write(in_elem[p * CONFIG_T::n_filt + c]); + } + + if (filt_mask == CONFIG_T::pool_width) { + FiltLoop: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + PoolLoop: + for (unsigned f = 0; f < CONFIG_T::pool_width; f++) { + pool_window[f] = data_window[c * CONFIG_T::pool_width + f].read(); + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res_pack[c] = reduce_pool(pool_window); + } else { + res_pack[outputs_ready * CONFIG_T::n_filt + c] = + reduce_pool(pool_window); + } + } + if (res_T::size / CONFIG_T::n_filt == + 1) { // Saves resources if we don't pack output, compiler will remove the else branch + res.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_filt) - 1) { + res.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } + } + } +} + +template +void pooling1d_encoded_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned 
outputs_ready = 0; + + ac_channel data_window[CONFIG_T::pool_width * CONFIG_T::n_filt]; + // constexpr int win_depth = CONFIG_T::n_out; + // for (unsigned i_out = 0; i_out < CONFIG_T::pool_width * CONFIG_T::n_filt; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + constexpr int pack_factor = data_T::size / CONFIG_T::n_filt; + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (pack_factor); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (res_T::size / CONFIG_T::n_filt == 1) { + //#pragma HLS PIPELINE II=pack_factor + } + compute_pool_encoded_1d(i_iw, data.read(), data_window, res, res_pack, outputs_ready); + } +} + +// ************************************************* +// Line Buffer Implementation (Phil's) 1D +// ************************************************* +template +void compute_pool_buffer_1d(const data_T &in_elem, ac_channel &res) { + //#pragma HLS INLINE + const static int lShiftX = CONFIG_T::pool_width - 1; + // Counters + static int pX = 0; + static int sX = 0; + + typename data_T::value_type pool_window[CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool_window complete + + static typename data_T::value_type kernel_data[CONFIG_T::pool_width * CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable = kernel_data complete dim = 0 + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + + // Add pixel into line buffer, return pooling kernels + // 1D case line buffer not necessary. 
Put directly into the kernel_data buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Can compute pooling output + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + FiltLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS PIPELINE + + // Retrieve data for current channel + PoolLoop: + for (unsigned i_iw = 0; i_iw < CONFIG_T::pool_width; i_iw++) { + pool_window[i_iw] = kernel_data[i_iw * CONFIG_T::n_filt + i_ic]; + } + + // Compute Pooling + res_pack[i_ic] = reduce_pool(pool_window); + } + + // Write to output + res.write(res_pack); + } + + // Counter Housekeeping + if (pX + 1 == CONFIG_T::n_in) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + // Update stride (threshold) ? subtract stride : increment stride + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void pooling1d_buffer_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in; i_iw++) { + //#pragma HLS LOOP_FLATTEN + //#pragma HLS PIPELINE + compute_pool_buffer_1d(data.read(), res); + } +} + +template void pooling1d_cl(ac_channel &data, ac_channel &res) { + //#pragma HLS inline region + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + pooling1d_buffer_cl(data, res); + break; + case conv_implementation::encoded: + pooling1d_encoded_cl(data, res); + break; + } +} + +// ************************************************* +// Global max/average pooling +// ************************************************* + +template T reduce_global_pool(T x, T y[N]) { + //#pragma HLS INLINE + if (CONFIG_T::pool_op == Max) { + Op_max op_max; + T y_max = reduce>(y, op_max); + return (x > y_max) ? 
x : y_max; + } else { + Op_add op_add; + T y_sum = reduce>(y, op_add); + return x + y_sum; + } +} + +template +void compute_global_pool(const data_T &in_elem, typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]) { +PoolFilt: + for (unsigned c = 0; c < CONFIG_T::n_filt; c++) { + + typename CONFIG_T::accum_t data_pack[data_T::size / CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=data_pack complete dim=0 + + PixelLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_filt; p++) { + data_pack[p] = in_elem[p * CONFIG_T::n_filt + c]; + } + data_window[c] = reduce_global_pool( + data_window[c], data_pack); + } +} + +template +void global_pooling2d_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height && CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + // init = hls::numeric_limits::min(); + init.template set_val(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + data_window[i_init] = init; + } + +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_filt); i_iw++) { + //#pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + 
AvgPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + AvgPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack] / (CONFIG_T::in_height * CONFIG_T::in_width); + } + res.write(res_pack); + } + } +} + +template +void global_pooling1d_cl(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + typename CONFIG_T::accum_t data_window[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=data_window complete + + typename CONFIG_T::accum_t init = 0; + if (CONFIG_T::pool_op == Max) { + // init = hls::numeric_limits::min(); + init.template set_val(); + } + +PoolInitLoop: + for (unsigned i_init = 0; i_init < CONFIG_T::n_filt; i_init++) { + data_window[i_init] = init; + } + +ReadInput: + for (unsigned i_iw = 0; i_iw < CONFIG_T::n_in / (data_T::size / CONFIG_T::n_filt); i_iw++) { + //#pragma HLS LOOP_FLATTEN + compute_global_pool(data.read(), data_window); + } + + if (CONFIG_T::pool_op == Max) { + MaxPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + MaxPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack]; + } + res.write(res_pack); + } + } else { + AvgPoolRes: + for (unsigned i_res = 0; i_res < CONFIG_T::n_filt / res_T::size; i_res++) { + //#pragma HLS PIPELINE + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + AvgPoolPack: + for (unsigned i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = data_window[i_pack] / CONFIG_T::n_in; + } + res.write(res_pack); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h 
b/hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h new file mode 100755 index 0000000000..fd2019f3d5 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_recr_activations.h @@ -0,0 +1,56 @@ +#ifndef NNET_RECR_ACTIVATION_H_ +#define NNET_RECR_ACTIVATION_H_ + +#include "ac_channel.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +namespace activation { + +template class Activation { + public: + // ************************************************* + // Blank Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here +}; + +template class relu : public Activation { + public: + // ************************************************* + // Relu Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::relu(data, res); + } +}; + +template class sigmoid : public Activation { + public: + // ************************************************* + // Sigmoid Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::sigmoid(data, res); + } +}; + +template class tanh : public Activation { + public: + // ************************************************* + // TanH Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::tanh(data, res); + } +}; + +} // namespace activation + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h b/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h new file mode 100755 index 0000000000..f08d4d1050 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_recurrent.h @@ -0,0 +1,572 @@ + +#ifndef NNET_RECURSIVE_H_ 
+#define NNET_RECURSIVE_H_ + +#include "ac_channel.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recr_activations.h" + +namespace nnet { + +struct lstm_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_parts = 20; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; +// Long Short term Memory NN (LSTM) +// Resources: +// https://github.com/nicodjimenez/lstm/blob/master/lstm.py +// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb +// https://en.wikipedia.org/wiki/Long_short-term_memory +// Notes: +// - LSTM naming conventions adopted from the above links +// - s_newstate = activation(U*input + W*state) +// - h_output = activation(U*input + W*state)*activation(s_newstate) +// - If softmax is needed on output, perform *outside* this operations +// Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE +// dense network at the end) +template +void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + // 
Initialize the state variable -- will maintain state between function calls + + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete + //#pragma HLS ARRAY_PARTITION variable=s_actstate complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, tmpres_state, param_r, param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to 
avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_newstate, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + //#pragma HLS UNROLL + h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + } +} + +template +void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + static res_T h_state[CONFIG_T::n_state]; + static res_T s_state[CONFIG_T::n_state]; + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + //#pragma HLS ARRAY_PARTITION variable=h_state complete + //#pragma HLS ARRAY_PARTITION variable=s_state complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS 
ARRAY_PARTITION variable=tmpres_state complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete + //#pragma HLS ARRAY_PARTITION variable=s_actstate complete + + if (reset_state) { + for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { + //#pragma HLS UNROLL + s_state[i_state] = 0; + h_state[i_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state, param_r, + param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + //#pragma HLS UNROLL + h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * 
CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + //#pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + //#pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + //#pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(ac_channel &data_stream, ac_channel &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + //#pragma HLS ARRAY_PARTITION 
variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +// Struct for the GRU template + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_sequence = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + 
static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + static const unsigned n_zeros = 0; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; + +template +void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param + // weights - refer page in copy!! + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, 
tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on this guy + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the stat with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); + } +} + +template +void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + + static res_T h_state[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename 
CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) + + //#pragma HLS ARRAY_PARTITION variable=h_state complete + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=tmpres complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + if (reset_state) { + for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { + //#pragma HLS UNROLL + h_state[i_h_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + 
tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on this guy + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the stat with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + //#pragma HLS UNROLL + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + res_T h_state[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + //#pragma HLS ARRAY_PARTITION variable=h_state complete + //#pragma HLS ARRAY_PARTITION variable=data_in complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_state[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + //#pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + //#pragma HLS 
UNROLL + res[i] = h_state[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + //#pragma HLS UNROLL + res[i] = h_state[i]; + } +} + +template +void gru_stack(ac_channel &data_stream, ac_channel &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::gru_static( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_newstate, + param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + //#pragma HLS UNROLL + res_pack[i_pack] = 
h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h new file mode 100644 index 0000000000..eb5ef9f7db --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv1d_stream.h @@ -0,0 +1,127 @@ +#ifndef NNET_SEPARABLE_CONV1D_STREAM_H_ +#define NNET_SEPARABLE_CONV1D_STREAM_H_ + +#include "ac_channel.h" +#include "nnet_common.h" +#include "nnet_conv1d_stream.h" +#include "nnet_sepconv_stream.h" + +namespace nnet { + +template +void depthwise_conv_1d_encoded_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + ac_channel data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + // const int win_depth = CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + //#pragma HLS DATA_PACK variable=res_pack + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), 
data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } +} + +template +void depthwise_conv_1d_buffer_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void depthwise_conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_width == 1); + + //#pragma HLS ARRAY_PARTITION variable=weights complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if 
(CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } +} + +template +void separable_conv_1d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + //#pragma HLS DATAFLOW + + ac_channel depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; + //#pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h new file mode 100644 index 0000000000..d98dd8c315 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d.h @@ -0,0 +1,82 @@ +#ifndef NNET_SEPARABLE_CONV2D_H_ +#define NNET_SEPARABLE_CONV2D_H_ + +#include "nnet_common.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_chan], + typename CONFIG_T::weight_t depthwise_weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t depthwise_biases[CONFIG_T::n_chan]) { + const int 
in_height = CONFIG_T::in_height; + const int in_width = CONFIG_T::in_width; + const int n_chan = CONFIG_T::n_chan; + const int filt_height = CONFIG_T::filt_height; + const int filt_width = CONFIG_T::filt_width; + const int out_height = CONFIG_T::out_height; + const int out_width = CONFIG_T::out_width; + + // constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; (void)ce_reuse_factor; + + // do { + + //#pragma HLS ARRAY_PARTITION variable=res complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=depthwise_biases complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=depthwise_weights complete dim=0 + for (int h = 0; h < in_height - filt_height + 1; h++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + for (int w = 0; w < in_width - filt_width + 1; w++) { + //#pragma HLS UNROLL + for (int c = 0; c < n_chan; c++) { + //#pragma HLS UNROLL + res_T sum = depthwise_biases[c]; + + // Apply the filter + for (int i = 0; i < filt_height; i++) { + //#pragma HLS UNROLL + for (int j = 0; j < filt_width; j++) { + //#pragma HLS UNROLL + int data_idx = (h + i) * in_width * n_chan + (w + j) * n_chan + c; + int weight_idx = i * filt_width * n_chan + j * n_chan + c; + sum += data[data_idx] * depthwise_weights[weight_idx]; + } + } + + int res_idx = (h * out_width * n_chan) + w * n_chan + c; + res[res_idx] = sum; + } + } + } + // } while (false); +} + +template +void separable_conv_2d_cl(data_T data[CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width * + CONFIG_T::depthwise_config::n_chan], + res_T res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width * + CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * 
CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + + //#pragma HLS INLINE region + + dw_res_T depthwise_results[CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width * + CONFIG_T::depthwise_config::n_chan]; + depthwise_conv_2d_cl(data, depthwise_results, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_results, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h new file mode 100644 index 0000000000..a4f7d4faa9 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv2d_stream.h @@ -0,0 +1,152 @@ +#ifndef NNET_SEPARABLE_CONV2D_STREAM_H_ +#define NNET_SEPARABLE_CONV2D_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_stream.h" +#include "nnet_sepconv_stream.h" +#include "nnet_types.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_encoded_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == CONFIG_T::filt_width); + + static ac_channel + data_window[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + // const int win_depth = CONFIG_T::filt_height * CONFIG_T::out_width; + // for (unsigned i_out = 0; i_out < CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + // #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + // } + + // #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels 
complete + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ac_int pixel_idx[data_T::size / CONFIG_T::n_chan]; + // #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + // #pragma HLS LOOP_FLATTEN + // if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // } + compute_scaled_indices_2d(i_ih, i_iw, pixel_idx); + compute_depthwise_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, + weights, biases, pixel_idx); + } + } +} + +// Line Buffer Implementation (Phil's) +template +void depthwise_conv_2d_buffer_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + static ap_shift_reg line_buffer[CONFIG_T::filt_height - 1] + [CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + // if (CONFIG_T::strategy == nnet::latency) { + // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + // } + if (CONFIG_T::filt_height > 1) { + 
compute_depthwise_output_buffer_2d(data.read(), line_buffer, res, weights, biases); + } else { + compute_depthwise_output_buffer_1d(data.read(), res, weights, biases); + } + } + } +} + +template +void depthwise_conv_2d_cl( + ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + // #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + depthwise_conv_2d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + depthwise_conv_2d_encoded_cl(data, res, weights, biases); + break; + } +} + +template +void pointwise_conv_2d_cl(ac_channel &data, ac_channel &res, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::filt_height == 1 && CONFIG_T::filt_width == 1); + + // #pragma HLS ARRAY_PARTITION variable=weights complete + // #pragma HLS ARRAY_PARTITION variable=biases complete + + constexpr int ce_reuse_factor = + CONFIG_T::reuse_factor * (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1); + (void)ce_reuse_factor; +ReadInputHeight: + for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { + ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + // #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + if (i_ih % CONFIG_T::stride_height == 0 && i_iw % CONFIG_T::stride_width == 0) { + pointwise_mult_buffer(data.read(), res, weights, biases); + } else { + data.read(); + } + } + } +} + +template +void separable_conv_2d_cl(ac_channel &data, ac_channel &res, + typename 
CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + // #pragma HLS DATAFLOW + + static ac_channel depthwise_res; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + // #pragma HLS STREAM variable=depthwise_res depth=res_depth + + depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h new file mode 100644 index 0000000000..753d260a77 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_sepconv_stream.h @@ -0,0 +1,315 @@ +#ifndef NNET_SEPARABLE_CONV_STREAM_H_ +#define NNET_SEPARABLE_CONV_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_conv_stream.h" +#include +#include + +namespace nnet { + +template +void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + // #pragma HLS INLINE + + typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + // #pragma HLS function_instantiate variable=weights + + //#pragma HLS 
PIPELINE II=CONFIG_T::reuse_factor + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; + + // Add dummy loop to which the pipeline pragma can be applied + do { + + //#pragma HLS ARRAY_PARTITION variable=mult complete + + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + // Do the matrix-multiply + Product: + for (int ii = 0; ii < CONFIG_T::kernel_size * CONFIG_T::n_chan; ii++) { + // #pragma HLS UNROLL + mult[ii] = CONFIG_T::mult_config::template product::product( + data[ii], weights[ii]); + } + + // Initialize accumulator with input biases + ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { + //#pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + // Accumulate multiplication result + Accum1: + for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) { + Accum2: + for (int jj = 0; jj < CONFIG_T::n_chan; jj++) { + int index = ii * CONFIG_T::n_chan + jj; + acc[jj] += mult[index]; + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_chan; ires++) { + //#pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } + } while (0); +} + +template +void depthwise_mult_buffer(ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + res_T &res_pack, ac_channel &res_stream, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + //#pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::kernel_size * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=data complete + typename res_T::value_type res[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=res complete + +InitData: + for (int id = 0; id < CONFIG_T::kernel_size * CONFIG_T::n_chan; id++) { + //#pragma HLS UNROLL + data[id] = data_window[id].read(); + } + + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + 
depthwise_product(data, res, weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." && false); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_chan; jj++) { + //#pragma HLS UNROLL + if (res_T::size / CONFIG_T::n_chan == 1) { + res_pack[jj] = res[jj]; + } else { + res_pack[outputs_ready * CONFIG_T::n_chan + jj] = res[jj]; + } + } + + if (res_T::size / CONFIG_T::n_chan == 1) { + res_stream.write(res_pack); + } else { + if (outputs_ready == (res_T::size / CONFIG_T::n_chan) - 1) { + res_stream.write(res_pack); + outputs_ready = 0; + } else { + outputs_ready++; + } + } +} + +template +void compute_depthwise_output_encoded( + const data_T &in_elem, ac_channel data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], + ac_channel &res, res_T &res_pack, unsigned &outputs_ready, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan], ac_int *pixel_idx) { + //#pragma HLS INLINE + + constexpr int ce_reuse_factor = CONFIG_T::reuse_factor; + (void)ce_reuse_factor; +MultLoop: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + CopyDataFilt: + for (unsigned f = 0; f < CONFIG_T::kernel_size; f++) { + //#pragma HLS UNROLL + CopyDataChan: + for (unsigned c = 0; c < CONFIG_T::n_chan; c++) { + //#pragma HLS UNROLL + if (pixel_idx[p][f]) + data_window[f * CONFIG_T::n_chan + c].write(in_elem[p * CONFIG_T::n_chan + c]); + } + } + if (pixel_idx[p][CONFIG_T::kernel_size - 1]) { + depthwise_mult_buffer(data_window, res_pack, res, outputs_ready, weights, biases); + } + } +} + +template +void pointwise_mult_buffer(const data_T &data_pack, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE + + typename data_T::value_type data[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION 
variable=data complete + + typename res_T::value_type res[CONFIG_T::n_filt]; + //#pragma HLS ARRAY_PARTITION variable=res complete + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + +InitData: + for (int id = 0; id < CONFIG_T::n_chan; id++) { + //#pragma HLS UNROLL + data[id] = data_pack[id]; + } + + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + dense_latency( + data, res, weights, biases); + } else { + dense_resource( + data, res, weights, biases); + } + +CastLoop: + for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { + //#pragma HLS UNROLL + res_pack[jj] = res[jj]; + } + + res_stream.write(res_pack); +} + +// Line Buffer Implementation (Phil's) +template +void compute_depthwise_output_buffer_1d(const data_T &in_elem, ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + + // Counters + static int pX = 0; + static int sX = 0; + + static typename data_T::value_type kernel_data[CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::kernel_shift_1d(in_elem, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { + // Dense multiply + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_product(kernel_data, res_out, + weights, biases); + } else { + assert("Resource strategy for DepthwiseConv1D is not supported." 
&& false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +template +void compute_depthwise_output_buffer_2d(const data_T &in_elem, + ap_shift_reg + line_buffer[MAX(CONFIG_T::filt_height - 1, 1)][CONFIG_T::n_chan], + ac_channel &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + //#pragma HLS INLINE + + // Thresholds + const static int lShiftX = CONFIG_T::filt_width - 1; + const static int lShiftY = CONFIG_T::filt_height - 1; + + // counters + static int pX = 0; // pixel X + static int pY = 0; // pixel Y + + static int sX = 0; // stride X + static int sY = 0; // stride Y + + static typename data_T::value_type kernel_data[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=kernel_data complete + + typename res_T::value_type res_out[CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=res_out complete dim = 0 + + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + + // Add pixel to buffer + nnet::shift_line_buffer(in_elem, line_buffer, kernel_data); + + // Check to see if we have a full kernel + if ((sX - lShiftX) == 0 && (sY - lShiftY) == 0 && pY > lShiftY - 1 && pX > lShiftX - 1) { + // Dense multiply + //#pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_product(kernel_data, res_out, + weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); + } + + // Pack output + CastLoop: + for (unsigned i_ic = 0; i_ic < CONFIG_T::n_filt; i_ic++) { + //#pragma HLS UNROLL + res_pack[i_ic] = res_out[i_ic]; + } + + // Write output to stream when output ready + res_stream.write(res_pack); + } + + // Pointer Housekeeping + if (pX + 1 == CONFIG_T::in_width) // Includes padding, end of line (padded) + { + pX = 0; + sX = 0; + if (pY + 1 == CONFIG_T::in_height) { // Reached bottom of image + pY = 0; + sY = 0; + } else { + pY = pY + 1; + sY = ((sY - lShiftY) == 0) ? sY - CONFIG_T::stride_height + 1 : sY + 1; + } + } else { + pX = pX + 1; + sX = ((sX - lShiftX) == 0) ? sX - CONFIG_T::stride_width + 1 : sX + 1; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h new file mode 100644 index 0000000000..c76bfba5a6 --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h @@ -0,0 +1,156 @@ + +#ifndef NNET_STREAM_H +#define NNET_STREAM_H + +#include "ac_channel.h" + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; +}; + +template +void clone_stream(ac_channel &data, ac_channel &res1, ac_channel &res2) { +// CloneLoop: for (int i = 0; i < N / data_T::size; i++) { +//#pragma HLS PIPELINE +#ifndef __SYNTHESIS__ + while (data.available(1)) +#endif + { + data_T in_data = data.read(); + res_T out_data; + // res_T out_data2; + //#pragma HLS DATA_PACK variable=out_data1 + //#pragma HLS DATA_PACK variable=out_data2 + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data[j]; + // out_data2[j] = in_data[j]; + } + + res1.write(out_data); + res2.write(out_data); + } +} + +template void repack_stream(ac_channel &data, ac_channel &res) { + if 
(data_T::size == res_T::size) { + for (int i = 0; i < N / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + for (int i = 0; i < N / data_T::size; i++) { + if (N / data_T::size > 1) { + //#pragma HLS PIPELINE + } + + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + + for (int j = 0; j < pack_diff; j++) { + //#pragma HLS PIPELINE + + res_T out_data; + for (int k = 0; k < res_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + for (int i = 0; i < N / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +template +void broadcast_stream_1x1xC(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan); + int n_dupl = (CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::out_chan) / + (CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + //#pragma HLS PIPELINE + data_T in_data = data.read(); + for (int j = 0; j < n_dupl; j++) { + //#pragma HLS PIPELINE + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + 
for (int k = 0; k < res_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data[k]; + } + res.write(out_data); + } + } +} + +template +void broadcast_stream_HxWx1(ac_channel &data, ac_channel &res) { + assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width); +BroadcastLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + //#pragma HLS PIPELINE + data_T in_data = data.read(); + res_T out_data; + //#pragma HLS DATA_PACK variable=out_data + for (int k = 0; k < res_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data[0]; + } + res.write(out_data); + } +} + +template +void broadcast_stream(ac_channel &data, ac_channel &res) { + if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { + broadcast_stream_1x1xC(data, res); + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width) { + broadcast_stream_HxWx1(data, res); + } +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_types.h b/hls4ml/templates/catapult/nnet_utils/nnet_types.h new file mode 100644 index 0000000000..d761891fdc --- /dev/null +++ b/hls4ml/templates/catapult/nnet_utils/nnet_types.h @@ -0,0 +1,64 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include + +namespace nnet { + +// Fixed-size array +template struct array { + typedef T value_type; + static const unsigned size = N; + + T data[N]; + + T &operator[](size_t pos) { return data[pos]; } + + const T &operator[](size_t pos) const { return data[pos]; } + + array &operator=(const array &other) { + if (&other == this) + return *this; + + assert(N == other.size && "Array sizes must match."); + + for (unsigned i = 0; i < N; i++) { + //#pragma HLS UNROLL + data[i] = other[i]; + } + return *this; + } +}; + +// Generic lookup-table 
implementation, for use in approximations of math functions +template class lookup_table { + public: + lookup_table(T from, T to) : range_start(from), range_end(to), base_div(ac_int<16, false>(N) / T(to - from)) { + T step = (range_end - range_start) / ac_int<16, false>(N); + for (size_t i = 0; i < N; i++) { + T num = range_start + ac_int<16, false>(i) * step; + T sample = func(num); + samples[i] = sample; + } + } + + T operator()(T n) const { + int index = (n - range_start) * base_div; + if (index < 0) + index = 0; + else if (index > N - 1) + index = N - 1; + return samples[index]; + } + + private: + T samples[N]; + const T range_start, range_end; + ac_fixed<20, 16, true> base_div; +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado_accelerator/build_lib.sh b/hls4ml/templates/vivado_accelerator/build_lib.sh old mode 100644 new mode 100755 diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 759a7115b1..c49b23f58c 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,3 +1,4 @@ +from hls4ml.writer.catapult_writer import CatapultWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter @@ -11,4 +12,5 @@ register_writer('Vitis', VitisWriter) register_writer('VitisAccelerator', VitisAcceleratorWriter) register_writer('Quartus', QuartusWriter) +register_writer('Catapult', CatapultWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py new file mode 100755 index 0000000000..48d44e4a59 --- /dev/null +++ b/hls4ml/writer/catapult_writer.py @@ -0,0 +1,929 @@ +import glob +import os +import tarfile +from collections import OrderedDict +from shutil import copyfile, copytree, rmtree + +import numpy as np +import yaml + +from hls4ml.backends import get_backend +from hls4ml.writer.writers import 
Writer + +config_filename = 'hls4ml_config.yml' + + +class CatapultWriter(Writer): + def print_array_to_cpp(self, var, odir, write_txt_file=True): + """Write a weights array to C++ header files. + + Args: + var (WeightVariable): Weight to write + odir (str): Output directory + write_txt_file (bool, optional): Write txt files in addition to .h files. Defaults to True. + """ + + h_file = open(f"{odir}/firmware/weights/{var.name}.h", "w") + if write_txt_file: + txt_file = open(f"{odir}/firmware/weights/{var.name}.txt", "w") + + # meta data + h_file.write(f"//Numpy array shape {var.shape}\n") + h_file.write(f"//Min {np.min(var.min):.12f}\n") + h_file.write(f"//Max {np.max(var.max):.12f}\n") + h_file.write(f"//Number of zeros {var.nzeros}\n") + h_file.write("\n") + + h_file.write(f"#ifndef {var.name.upper()}_H_\n") + h_file.write(f"#define {var.name.upper()}_H_\n") + h_file.write("\n") + + if write_txt_file: + h_file.write("#ifndef __SYNTHESIS__\n") + h_file.write("// global extern pointer only - actual array allocated in myproject_test.cpp\n") + h_file.write("extern " + var.definition_cpp() + ";\n") + h_file.write("#else\n") + + h_file.write(var.definition_cpp() + " = {") + + # fill c++ array. + # not including internal brackets for multidimensional case + sep = '' + for x in var: + h_file.write(sep + x) + if write_txt_file: + txt_file.write(sep + x) + sep = ", " + h_file.write("};\n") + if write_txt_file: + h_file.write("#endif\n") + txt_file.close() + h_file.write("\n#endif\n") + h_file.close() + + def write_output_dir(self, model): + """Write the base output directory + + Args: + model (ModelGraph): the hls4ml model. + """ + if not os.path.isdir(f"{model.config.get_output_dir()}/firmware/weights"): + os.makedirs(f"{model.config.get_output_dir()}/firmware/weights") + + @staticmethod + def _make_array_pragma(variable, model): + """ + Layers in hls_model.py can specify output array partitioning through the `pragma` attribute. 
+ If `pragma` is a string: options are 'partition', 'reshape', or 'stream'. + If `pragma` is a tuple: (mode, type, factor) where mode is 'partition' or 'reshape', type is + 'complete', 'cyclic', or 'block', and factor is an integer only used when the type is not 'complete'. + """ + + config = variable.pragma + if type(config) is tuple: + mode = config[0] + if mode in ['partition', 'reshape']: + typ = config[1] + if typ != 'complete': + factor = config[2] + elif mode == 'stream': + depth = config[1] + else: + mode = config + typ = 'complete' + factor = 0 + + if mode in ['partition', 'reshape']: + if typ == 'complete': + template = '// #pragma HLS ARRAY_{mode} variable={name} {type} dim={dim}' + else: + template = '// #pragma HLS ARRAY_{mode} variable={name} {type} factor={factor} dim={dim}' + + return template.format(mode=mode.upper(), name=variable.name, type=typ, factor=factor, dim=0) + + elif mode == 'stream': + fifo = model.config.get_config_value("FIFO") + if fifo is not None: + retstr = f'#pragma hls_resource {variable.name}:cns variables="{variable.name}"' + retstr += f' map_to_module="{fifo}" // depth="{depth}"' + return retstr + else: + return '' + else: + return '' + + @staticmethod + def _make_array_fifo_pragma(variable, model): + config = variable.pragma + factor = '' + if type(config) is tuple: + mode = config[0] + if mode in ['partition', 'reshape']: + typ = config[1] + if typ != 'complete': + factor = config[2] + elif mode == 'stream': + depth = config[1] + else: + mode = config + typ = 'complete' + factor = 0 + + if mode == 'stream': + fifo = model.config.get_config_value("FIFO") + if fifo is not None: + return f'// #pragma hls_fifo_depth {depth} {factor}' + else: + return '' + else: + return '' + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + + fout = open(f'{model.config.get_output_dir()}/firmware/layer_summary.txt', 'w') + outstr = "" + outstr = outstr + "{}".format("Layer Name").ljust(25) + outstr = outstr + " {}".format("Layer Class").ljust(20) + outstr = outstr + " {}".format("Input Type").ljust(40) + outstr = outstr + " {}".format("Input Shape").ljust(15) + outstr = outstr + " {}".format("Output Type").ljust(40) + outstr = outstr + " {}".format("Output Shape").ljust(15) + # outstr = outstr + " {}".format("Weight Type").ljust(24) + # outstr = outstr + " {}".format("Bias Type").ljust(24) + outstr = outstr + " {}".format("Filter Shape").ljust(15) + outstr = outstr + " {}".format("Stride").ljust(10) + outstr = outstr + " {}".format("IOType").ljust(15) + outstr = outstr + " {}".format("Reuse").ljust(10) + + fout.write(outstr + "\n") + input_shape = "" + input_datatype = "" + for layer in model.get_layers(): + datatype = layer.get_output_variable().type.precision.definition_cpp() + " " + shape = "" + # layer.get_output_variable().type.precision.width + # layer.get_output_variable().type.precision.integer + # layer.get_output_variable().type.precision.sign + for _k, v in layer.get_output_variable().get_shape(): + shape = shape + "[" + str(v) + "]" + + if layer.attributes.layer.class_name != 'Input': + my_class_name = layer.class_name + if layer.attributes.layer.class_name == 'Activation': + my_class_name = layer.get_attr('activation') + + # filter_datatype = "" + # print(layer.weights.__dir__()) + # layer_precision = layer.get_layer_precision() + # for wname, weights in layer.weights.items(): + # print(wname) + # print(weights.type.name) + # print(weights.type.precision.definition_cpp()) + # #print(weights.type.precision.__dir__()) + # print(weights.type.precision.width) + # if 'ACFixed' in weights.type.precision.__class__: + # print(weights.type.precision.integer) + # print(weights.type.precision.signed) + # print(weights.data_length) + + 
filter = "" + filt_width = layer.get_attr('filt_width') + filt_height = layer.get_attr('filt_height') + if filt_width is not None: + filter = "[" + str(filt_width) + "]" + if filt_height is not None: + filter = filter + "[" + str(filt_height) + "]" + + stride = "" + stride_width = layer.get_attr('stride_width') + if stride_width is not None: + stride = str(stride_width) + + outstr = "" + outstr = outstr + f"{layer.name}".ljust(25) + outstr = outstr + f" {my_class_name}".ljust(20) + outstr = outstr + f" {input_datatype}".ljust(40) + outstr = outstr + f" {input_shape}".ljust(15) + outstr = outstr + f" {datatype}".ljust(40) + outstr = outstr + f" {shape}".ljust(15) + # outstr = outstr + " {}".format("weight type").ljust(24) + # outstr = outstr + " {}".format("bias type").ljust(24) + outstr = outstr + f" {filter}".ljust(15) + outstr = outstr + f" {stride}".ljust(10) + outstr = outstr + " {}".format(layer.model.config.get_config_value('IOType')).ljust(15) + outstr = outstr + f" {str(layer.model.config.get_reuse_factor(layer))}".ljust(10) + fout.write(outstr + "\n") + + input_shape = shape + input_datatype = datatype + + fout.close() + + f = open(os.path.join(filedir, '../templates/catapult/firmware/myproject.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}.cpp', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + 
b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + + elif '// hls-fpga-machine-learning insert load weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += indent + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += indent + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + # Add Interface Synthesis resource pragmas + elif '// hls-fpga-machine-learning insert IFSynPragmas' in line: + newline = line + all_inputs = [i.name for i in model_inputs] + all_outputs = [o.name for o in model_outputs] + all_brams = [b.name for b in model_brams] + io_type = model.config.get_config_value("IOType") + + if io_type == 'io_serial' or io_type == 'io_stream': + # Eventually this will be amba.ccs_axi4stream_in and amba.ccs_axi4stream_out + for dut_input in all_inputs: + newline += f'#pragma hls_resource {dut_input}:rsc variables="{dut_input}"' + newline += ' map_to_module="ccs_ioport.ccs_in_wait"\n' + for dut_output in all_outputs: + newline += f'#pragma hls_resource {dut_output}:rsc variables="{dut_output}"' + newline += ' map_to_module="ccs_ioport.ccs_out_wait"\n' + + # Add input/output type + elif '// hls-fpga-machine-learning insert IO' in line: + newline = line + all_inputs = [i.name for i in model_inputs] + all_outputs = [o.name for o in model_outputs] + all_brams = [b.name for b in model_brams] + io_type = model.config.get_config_value("IOType") + 
+ if io_type == 'io_parallel': + for i in model_inputs: + newline += indent + self._make_array_pragma(i, model) + '\n' + for o in model_outputs: + newline += indent + self._make_array_pragma(o, model) + '\n' + # TODO discussed adding a handle for setting the interface mode for individual input and output arrays + # Probably the handle doesn't need to be exposed to the user but should be just set in hls_model.py + newline += indent + '// #pragma HLS INTERFACE ap_vld port={},{} \n'.format( + ','.join(all_inputs), ','.join(all_outputs) + ) + if model.config.model_strategy.lower() == 'dataflow': + newline += indent + '// #pragma HLS DATAFLOW \n' + else: + newline += indent + '// #pragma HLS PIPELINE \n' + if io_type == 'io_stream': + newline += indent + '// #pragma HLS INTERFACE axis port={},{} \n'.format( + ','.join(all_inputs), ','.join(all_outputs) + ) + if all_brams: + newline += indent + '// #pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams)) + newline += indent + '// #pragma HLS DATAFLOW \n' + + elif '// hls-fpga-machine-learning insert layers' in line: + io_type = model.config.get_config_value("IOType") + newline = line + '\n' + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + if var.pragma: + newline += ' ' + self._make_array_fifo_pragma(var, model) + '\n' + if io_type == 'io_serial' or io_type == 'io_stream': + newline += ' static ' + def_cpp + '; \n' + else: + newline += ' ' + def_cpp + '; \n' + if var.pragma: + newline += ' ' + self._make_array_pragma(var, model) + '\n' + func = layer.get_attr('function_cpp', None) + if func: + if not isinstance(func, (list, set)): + func = [func] + if len(func) == 1: + newline += ' ' + func[0] + ' // ' + layer.name + '\n' + else: + newline += ' // ' + layer.name + '\n' + for line in func: + newline += ' ' + line + '\n' + if model.config.trace_output and 
layer.get_attr('trace', False): + newline += '#ifndef __SYNTHESIS__\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + newline += '\n' + + # Just copy line + else: + newline = line + + fout.write(newline) + + f.close() + fout.close() + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/firmware/myproject.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}.h', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_defines(self, model): + """Write the C++ type definitions file (defines.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/firmware/defines.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/defines.h', 'w') + + for line in f.readlines(): + # Insert numbers + if '// hls-fpga-machine-learning insert numbers' in line: + newline = line + + defines_list = [] + for layer in model.get_layers(): + defines = '' + for k, v in layer.get_output_variable().get_shape(): + defines += f'#define {k} {v}\n' + + defines_list.append(defines) + + newline += ''.join(defines_list) + + elif '// hls-fpga-machine-learning insert layer-precision' in line: + newline = line + all_precision = OrderedDict() + for layer in model.get_layers(): + layer_precision = layer.get_layer_precision() + for type_name, type_var in layer_precision.items(): + # Ensure that layer's types doesn't override existing types + # This can happen in case of InplaceVariable types + if type_name not in all_precision: + all_precision[type_name] = type_var + for used_type in all_precision.values(): + newline += used_type.definition_cpp() + + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def write_parameters(self, model): + """Write the C++ layer config file (parameters.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/firmware/parameters.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/parameters.h', 'w') + + for line in f.readlines(): + if '// hls-fpga-machine-learning insert includes' in line: + newline = line + for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + newline += '#include "%s"\n' % include + + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.storage.lower() != 'bram': + newline += f'#include "weights/{w.name}.h"\n' + + elif "// hls-fpga-machine-learning insert layer-config" in line: + newline = line + for layer in model.get_layers(): + config = layer.get_attr('config_cpp', None) + if config: + newline += '// ' + layer.name + '\n' + newline += config + '\n' + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def write_weights(self, model): + """Write the weights into header files + + Args: + model (ModelGraph): the hls4ml model. + """ + for layer in model.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp(weights, model.config.get_output_dir()) + + def __make_dat_file(self, original_path, project_path): + """ + Convert other input/output data types into a dat file, which is + a text file with the falttened matrix printed out. Note that ' ' is + assumed to be the delimiter. 
+ """ + + # Take in data from current supported data files + if original_path[-3:] == "npy": + data = np.load(original_path) + else: + raise Exception("Unsupported input/output data files.") + + # Faltten data, just keep first dimension + data = data.reshape(data.shape[0], -1) + + def print_data(f): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + f.write(str(data[i][j]) + " ") + f.write("\n") + + # Print out in dat file + with open(project_path, "w") as f: + print_data(f) + + def write_test_bench(self, model): + """Write the testbench files (myproject_test.cpp and input/output .dat files) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): + os.mkdir(f'{model.config.get_output_dir()}/tb_data/') + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == "dat": + copyfile(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == "dat": + copyfile(output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file( + output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' + ) + + f = open(os.path.join(filedir, '../templates/catapult/myproject_test.cpp')) + fout = open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + for line in f.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + # Insert 
numbers + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert declare weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + newline += w.definition_cpp() + ";\n" + + elif '// hls-fpga-machine-learning insert load weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += indent + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += indent + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + offset = 0 + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += ' nnet::copy_data(in, {});\n'.format( + inp.type.name, offset, inp.size_cpp(), inp.name + ) + offset += inp.size() + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert random' in line: + newline = line + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += f' nnet::fill_random<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += 
f' nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + elif '// hls-fpga-machine-learning insert top-level-function' in line: + newline = line + + input_vars = ','.join([i.name for i in model_inputs]) + output_vars = ','.join([o.name for o in model_outputs]) + bram_vars = ','.join([b.name for b in model_brams]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' + + newline += top_level + elif '// hls-fpga-machine-learning insert predictions' in line: + newline = line + for out in model_outputs: + newline += indent + f'for(int i = 0; i < {out.size_cpp()}; i++) {{\n' + newline += indent + ' std::cout << pr[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + elif '// hls-fpga-machine-learning insert tb-output' in line: + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( + out.type.name, out.size_cpp(), out.name + ) # TODO enable this + elif ( + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line + ): + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, std::cout, true);\n'.format( + out.type.name, out.size_cpp(), out.name + ) + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/catapult/myproject_bridge.cpp')) + fout = open(f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp', 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif 'myproject' in line: + newline = line.replace('myproject', format(model.config.get_project_name())) + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert declare weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + newline += w.definition_cpp() + ";\n" + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + '{var};\n'.format(var=i.definition_cpp(name_suffix='_ap')) + newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format( + dtype, i.type.name, i.size_cpp(), i.name, i.name + ) + newline += '\n' + + for o in model_outputs: + newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) + + newline += '\n' + + input_vars = ','.join([i.name + '_ap' for i in 
model_inputs]) + bram_vars = ','.join([b.name for b in model_brams]) + output_vars = ','.join([o.name + '_ap' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' + newline += top_level + + newline += '\n' + + for o in model_outputs: + newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( + o.type.name, dtype, o.size_cpp(), o.name, o.name + ) + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp', None) + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_build_script(self, model): + """Write the TCL/Shell build scripts (build_prj.tcl, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + + # build_prj.tcl + srcpath = os.path.join(filedir, '../templates/catapult/build_prj.tcl') + dstpath = f'{model.config.get_output_dir()}/build_prj.tcl' + # copyfile(srcpath, dstpath) + f = open(srcpath) + fout = open(dstpath, 'w') + for line in f.readlines(): + indent = line[: len(line) - len(line.lstrip())] + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('CATAPULT_DIR', model.config.get_project_dir()) + if '#hls-fpga-machine-learning insert techlibs' in line: + if model.config.get_config_value('Technology') is None: + if model.config.get_config_value('Part') is not None: + line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) + elif model.config.get_config_value('ASICLibs') is not None: + line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) + else: + if model.config.get_config_value('Technology') == 'asic': + line = indent + 'setup_asic_libs {{{}}}\n'.format(model.config.get_config_value('ASICLibs')) + else: + line = indent + 'setup_xilinx_part {{{}}}\n'.format(model.config.get_config_value('Part')) + elif '#hls-fpga-machine-learning insert invoke_args' in line: + tb_in_file = model.config.get_config_value('InputData') + tb_out_file = model.config.get_config_value('OutputPredictions') + invoke_args = '$sfd/firmware/weights' + if tb_in_file is not None: + invoke_args = invoke_args + f' $sfd/tb_data/{tb_in_file}' + if tb_out_file is not None: + invoke_args = invoke_args + f' $sfd/tb_data/{tb_out_file}' + line = indent + f'flow package option set /SCVerify/INVOKE_ARGS "{invoke_args}"\n' + elif 'set hls_clock_period 5' in line: + line = indent + 'set hls_clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')) + fout.write(line) + f.close() + fout.close() + + # build_lib.sh + f = open(os.path.join(filedir, '../templates/catapult/build_lib.sh')) + fout = 
open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') + + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + fout.write(line) + f.close() + fout.close() + + def write_nnet_utils(self, model): + """Copy the nnet_utils, AP types headers and any custom source to the project output directory + + Args: + model (ModelGraph): the hls4ml model. + """ + + # nnet_utils + filedir = os.path.dirname(os.path.abspath(__file__)) + + srcpath = os.path.join(filedir, '../templates/catapult/nnet_utils/') + dstpath = f'{model.config.get_output_dir()}/firmware/nnet_utils/' + + if not os.path.exists(dstpath): + os.mkdir(dstpath) + + headers = [os.path.basename(h) for h in glob.glob(srcpath + '*.h')] + + if model.config.get_config_value('DontCopyNNET') is not None: + h = 'nnet_code_gen.h' + copyfile(srcpath + h, dstpath + h) + return + + for h in headers: + copyfile(srcpath + h, dstpath + h) + + print("Copying NNET files to local firmware directory") + + filedir = os.path.dirname(os.path.abspath(__file__)) + for pkg in ('ac_types', 'ac_math', 'ac_simutils'): + dstpath = f'{model.config.get_output_dir()}/firmware/{pkg}/' + + # backward compatibility, look in root dir + srcpath = os.path.join(filedir, '../../' + pkg + '/') + if not os.path.exists(srcpath): + # look next in Catapult-specific templates + srcpath = os.path.join(filedir, '../templates/catapult/' + pkg + '/') + + if os.path.exists(srcpath): + if os.path.exists(dstpath): + rmtree(dstpath) + print("... copying AC " + pkg + " headers from " + srcpath) + copytree(srcpath, dstpath) + else: + print("... 
skipping copy of " + pkg + " headers - assumed to located in Catapult install tree") + + # custom source + filedir = os.path.dirname(os.path.abspath(__file__)) + + custom_source = get_backend('Catapult').get_custom_source() + for dst, srcpath in custom_source.items(): + dstpath = f'{model.config.get_output_dir()}/firmware/{dst}' + copyfile(srcpath, dstpath) + + def write_generated_code(self, model): + """Write the generated code (nnet_code_gen.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + path = f'{model.config.get_output_dir()}/firmware/nnet_utils/nnet_code_gen.h' + f = open(path) + contents = f.readlines() + f.close() + f = open(path, 'w') + + for line in contents: + if '// hls4ml insert code' in line: + newline = line + for layer in model.get_layers(): + for generated_code in layer.code.values(): + newline += str(generated_code) + else: + newline = line + f.write(newline) + f.close() + + def write_yml(self, model): + """Write the config to the YAML file + + Args: + model (ModelGraph): the hls4ml model. + """ + + def keras_model_representer(dumper, keras_model): + model_path = model.config.get_output_dir() + '/keras_model.h5' + keras_model.save(model_path) + return dumper.represent_scalar('!keras_model', model_path) + + try: + from tensorflow.keras import Model as KerasModel + + yaml.add_multi_representer(KerasModel, keras_model_representer) + except Exception: + pass + + with open(model.config.get_output_dir() + '/' + config_filename, 'w') as file: + yaml.dump(model.config.config, file) + + def write_tar(self, model): + """Write the generated project as a .tar.gz archive + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + if not os.path.exists(model.config.get_output_dir() + '.tar.gz'): + with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True) + else: + print("Project .tar.gz archive already exists") + + def write_hls(self, model): + print('Writing HLS project') + self.write_output_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_generated_code(model) + self.write_yml(model) + self.write_tar(model) + print('Done') diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index 5477da933a..50e9f799f6 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -5,7 +5,8 @@ - k8s-default before_script: - source ~/.bashrc - - if [ $EXAMPLEMODEL == 1 ]; then git submodule init; git submodule update; fi + - git submodule update --init --recursive hls4ml/templates/catapult/ + - if [ $EXAMPLEMODEL == 1 ]; then git submodule update --init example-models; fi - conda activate hls4ml-testing - pip install .[testing,sr,optimization] script: diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index caaaed636a..5ab9481e1a 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -12,7 +12,7 @@ # Variable 'name' is simply used as an identifier for the activation -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult', 'Quartus']) @pytest.mark.parametrize('shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')]) @pytest.mark.parametrize( 'activation, name', diff --git a/test/pytest/test_batchnorm.py b/test/pytest/test_batchnorm.py index c0ef0705ae..727d2ee574 
100644 --- a/test/pytest/test_batchnorm.py +++ b/test/pytest/test_batchnorm.py @@ -29,7 +29,7 @@ def model(request): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('model', [True, False], indirect=True) def test_batchnorm(model, data, backend, io_type): default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>' diff --git a/test/pytest/test_batchnorm_pytorch.py b/test/pytest/test_batchnorm_pytorch.py index a7a0c80247..93cda2729c 100644 --- a/test/pytest/test_batchnorm_pytorch.py +++ b/test/pytest/test_batchnorm_pytorch.py @@ -21,7 +21,7 @@ def data(): @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_batchnorm(data, backend, io_type): model = nn.Sequential( nn.BatchNorm1d(in_shape), diff --git a/test/pytest/test_clone_flatten.py b/test/pytest/test_clone_flatten.py index 12f30985bf..5f631d027f 100644 --- a/test/pytest/test_clone_flatten.py +++ b/test/pytest/test_clone_flatten.py @@ -28,7 +28,7 @@ def keras_model(): @pytest.fixture @pytest.mark.parametrize('io_type', ['io_stream']) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model( keras_model, diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index ab3365f228..27b966f51d 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -61,7 +61,7 @@ def keras_model(mnist_data): ('Vitis', 'io_parallel', 'resource'), ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), - ('Vitis', 
'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_conv1d.py b/test/pytest/test_conv1d.py index 79beb01a2c..48357a42a1 100644 --- a/test/pytest/test_conv1d.py +++ b/test/pytest/test_conv1d.py @@ -41,6 +41,8 @@ def keras_model(): ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), ('Vitis', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def hls_model(keras_model, backend, io_type, strategy): @@ -91,6 +93,8 @@ def hls_model(keras_model, backend, io_type, strategy): ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), ('Vitis', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def test_accuracy(data, keras_model, hls_model): diff --git a/test/pytest/test_embed.py b/test/pytest/test_embed.py index fd8e39cdb9..a27fc45b93 100644 --- a/test/pytest/test_embed.py +++ b/test/pytest/test_embed.py @@ -25,7 +25,7 @@ def keras_model(): @pytest.fixture -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def hls_model(keras_model, backend, io_type): hls_config = hls4ml.utils.config_from_keras_model(keras_model, default_precision='ap_fixed<16,6>', granularity='name') @@ -39,7 +39,7 @@ def hls_model(keras_model, backend, io_type): return hls_model -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_embedding_accuracy(data, keras_model, hls_model): X = data diff --git a/test/pytest/test_globalpooling.py b/test/pytest/test_globalpooling.py index c402a53cdf..b99f0d8212 
100644 --- a/test/pytest/test_globalpooling.py +++ b/test/pytest/test_globalpooling.py @@ -32,7 +32,7 @@ def keras_model_1d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_1d', [ @@ -87,7 +87,7 @@ def keras_model_2d(request): return model, model_type, keepdims -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_2d', [ diff --git a/test/pytest/test_keras_h5_loader.py b/test/pytest/test_keras_h5_loader.py index b53bb3a668..0c42adee31 100644 --- a/test/pytest/test_keras_h5_loader.py +++ b/test/pytest/test_keras_h5_loader.py @@ -9,7 +9,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_keras_h5_loader(backend): input_shape = (10,) model = tf.keras.models.Sequential( diff --git a/test/pytest/test_keras_nested_model.py b/test/pytest/test_keras_nested_model.py index 8c4670ad51..66fa81e2f9 100755 --- a/test/pytest/test_keras_nested_model.py +++ b/test/pytest/test_keras_nested_model.py @@ -127,7 +127,7 @@ def randX_20_15(): return randX(20, 15) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_nested_model(randX_20_15, backend, io_type): n_in = 15 @@ -150,7 +150,7 @@ def test_nested_model(randX_20_15, backend, io_type): np.testing.assert_allclose(y_keras.ravel(), y_hls4ml.ravel(), rtol=1e-2, atol=0.02) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) 
@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_sub_nested_model(randX_20_15, backend, io_type): n_in = 15 diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index b7fee0a4ab..060b9877de 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -31,6 +31,8 @@ ('Vivado', 'io_stream', 'resource'), ('Vitis', 'io_stream', 'latency'), ('Vitis', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): @@ -87,6 +89,8 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): ('Vivado', 'io_parallel', 'latency'), ('Vivado', 'io_stream', 'latency'), ('Vivado', 'io_stream', 'resource'), + ('Catapult', 'io_stream', 'latency'), + ('Catapult', 'io_stream', 'resource'), ], ) def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): diff --git a/test/pytest/test_pooling.py b/test/pytest/test_pooling.py index 1f958696d8..d7de80a5a7 100644 --- a/test/pytest/test_pooling.py +++ b/test/pytest/test_pooling.py @@ -32,7 +32,7 @@ def keras_model_1d(request): return model, model_type, pads -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_1d', [ @@ -87,7 +87,7 @@ def keras_model_2d(request): return model, model_type, pads -@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado']) +@pytest.mark.parametrize('backend', ['Quartus', 'Vitis', 'Vivado', 'Catapult']) @pytest.mark.parametrize( 'keras_model_2d', [ diff --git a/test/pytest/test_repack_stream.py b/test/pytest/test_repack_stream.py index 12d44a66b7..04cc9867a9 100644 --- a/test/pytest/test_repack_stream.py +++ b/test/pytest/test_repack_stream.py @@ -9,7 +9,7 @@ test_root_path = Path(__file__).parent 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) def test_repack_precision(backend: str): inp = keras.Input(shape=(3, 3), name='inp') out = keras.layers.Reshape((3, 3), name='reshape')(inp) @@ -41,7 +41,7 @@ def test_repack_precision(backend: str): assert repack_precision.signed is True, 'Precision mismatch' -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('strategy', ['Latency', 'Resource']) def test_repack(backend: str, strategy: str): inp1 = keras.Input(shape=(4,), name='inp1') diff --git a/test/pytest/test_reshape.py b/test/pytest/test_reshape.py index 3c421c1474..ac277bb491 100755 --- a/test/pytest/test_reshape.py +++ b/test/pytest/test_reshape.py @@ -21,7 +21,7 @@ def randX_20_10(): return randX(20, 10) -@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_reshape_parallel(randX_20_10, backend, io_type): model = tf.keras.models.Sequential( diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index a75d854283..64b72db48a 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -25,7 +25,7 @@ @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) @pytest.mark.parametrize('io_type', io_type_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 3) diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 1ce85c5016..2fa2d94afe 100644 --- 
a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -25,7 +25,7 @@ @pytest.mark.parametrize("kernels", kernel_options) @pytest.mark.parametrize("bias", bias_options) @pytest.mark.parametrize("io_type", io_type_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 3cab00745c..19c9042465 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -19,7 +19,7 @@ def generate_data(input_shape): return np.clip(d, -32, 31) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('strategy', ['stable', 'latency', 'argmax']) @pytest.mark.parametrize( 'input_bits,input_shape,table_bits,io_type', @@ -65,7 +65,7 @@ def test_softmax(backend, strategy, generate_data, input_bits, input_shape, tabl assert acc_hls4ml >= 0.98 -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) def test_softmax_skipped(backend, io_type): X = np.random.rand(100, 10) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index a23e89e7da..217865fe46 100644 --- a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -10,7 +10,7 @@ test_root_path = Path(__file__).parent -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('input_shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')]) def 
test_softsign(backend, input_shape, io_type): X = np.random.rand(1000, *input_shape) diff --git a/test/pytest/test_upsampling.py b/test/pytest/test_upsampling.py index 8ec5cabda9..9051d582bd 100644 --- a/test/pytest/test_upsampling.py +++ b/test/pytest/test_upsampling.py @@ -46,7 +46,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_upsampling(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': diff --git a/test/pytest/test_zeropadding.py b/test/pytest/test_zeropadding.py index 962a3334a6..95f7d79a7d 100644 --- a/test/pytest/test_zeropadding.py +++ b/test/pytest/test_zeropadding.py @@ -50,7 +50,7 @@ def keras_model_2d(): @pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'Catapult']) @pytest.mark.parametrize('model_type', ['1d', '2d']) def test_zeropadding(keras_model_1d, keras_model_2d, data_1d, data_2d, model_type, io_type, backend): if model_type == '1d': From 2a71a8391efb4533374bb4d1eb5019c265558f98 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 20:06:25 +0000 Subject: [PATCH 023/103] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.3.0 → 24.4.0](https://github.com/psf/black/compare/24.3.0...24.4.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a817208398..275b349422 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.4.0 hooks: - id: black language_version: python3 From 6ac964c74b45ac3c1d6da7753f1297bb4094a537 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Mon, 15 Apr 2024 20:36:48 -0700 Subject: [PATCH 024/103] fix unwanted tested file change in #956 --- test/pytest/test_cnn_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index 27b966f51d..ab3365f228 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -61,7 +61,7 @@ def keras_model(mnist_data): ('Vitis', 'io_parallel', 'resource'), ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'latency'), + ('Vitis', 'io_stream', 'resource'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): From ec95e010e2c30728f074f0210e912a2a7b94447b Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Wed, 10 Apr 2024 20:43:21 +0200 Subject: [PATCH 025/103] Fix SR backend synth missing variables --- hls4ml/writer/symbolic_writer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hls4ml/writer/symbolic_writer.py b/hls4ml/writer/symbolic_writer.py index 8ab5c53806..b442d3cd39 100644 --- a/hls4ml/writer/symbolic_writer.py +++ b/hls4ml/writer/symbolic_writer.py @@ -68,6 +68,10 @@ def write_build_script(self, model): f.write('set part "{}"\n'.format(model.config.get_config_value('Part'))) f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) + f.write('variable clock_uncertainty\n') + f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '0%'))) + f.write('variable version\n') + f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) 
f.close() # build_prj.tcl From 5de1bf5cc7954ea624aad683349dc3e28c7109a7 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 16 Apr 2024 18:38:36 +0200 Subject: [PATCH 026/103] Test for SR backend config --- hls4ml/backends/symbolic/symbolic_backend.py | 14 ++-- test/pytest/test_sr.py | 67 ++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/symbolic/symbolic_backend.py b/hls4ml/backends/symbolic/symbolic_backend.py index 29956f147b..bad75c2417 100644 --- a/hls4ml/backends/symbolic/symbolic_backend.py +++ b/hls4ml/backends/symbolic/symbolic_backend.py @@ -42,7 +42,7 @@ def create_initial_config( self, part='xcvu9p-flga2577-2-e', clock_period=5, - clock_uncertainty='12.5%', + clock_uncertainty=None, io_type='io_parallel', compiler='vivado_hls', hls_include_path=None, @@ -50,11 +50,17 @@ def create_initial_config( ): config = {} - config['Part'] = part if part is not None else 'xcvu9p-flga2577-2-e' - config['ClockPeriod'] = clock_period + config['Part'] = part if part is not None else 'xcvu13p-flga2577-2-e' + config['ClockPeriod'] = clock_period if clock_period is not None else 5 config['ClockUncertainty'] = clock_uncertainty - config['IOType'] = io_type + config['IOType'] = io_type if io_type is not None else 'io_parallel' config['Compiler'] = compiler if compiler is not None else 'vivado_hls' + if config['ClockUncertainty'] is None: + if config['Compiler'] == 'vivado_hls': + config['ClockUncertainty'] = '12.5%' + else: + config['ClockUncertainty'] = '27%' + if not all([hls_include_path, hls_libs_path]): # Try to infer the include path from Vivado path bin_path = os.popen(f'command -v {compiler}').read().strip() diff --git a/test/pytest/test_sr.py b/test/pytest/test_sr.py index e4c922cc34..272450b658 100644 --- a/test/pytest/test_sr.py +++ b/test/pytest/test_sr.py @@ -69,3 +69,70 @@ def test_pysr_luts(data): eq = str(model.sympy()) assert 'cos_lut' in eq + + +@pytest.mark.parametrize('part', ['some_part', None]) 
+@pytest.mark.parametrize('clock_period', [8, None]) +@pytest.mark.parametrize('clock_unc', ['15%', None]) +@pytest.mark.parametrize('compiler', ['vivado_hls', 'vitis_hls']) +def test_sr_backend_config(part, clock_period, clock_unc, compiler): + + expr = 'x0**2 + 2.5382*cos_lut(x3) - 0.5' + + if clock_unc is not None: + unc_str = clock_unc.replace('%', '') + else: + unc_str = clock_unc + + compiler_str = compiler.replace('_hls', '') + + test_dir = f'hls4mlprj_sr_backend_config_part_{part}_period_{clock_period}_unc_{unc_str}_{compiler_str}' + output_dir = test_root_path / test_dir + + hls_model = hls4ml.converters.convert_from_symbolic_expression( + expr, + n_symbols=5, + precision='ap_fixed<18,6>', + output_dir=str(output_dir), + part=part, + clock_period=clock_period, + clock_uncertainty=clock_unc, + compiler=compiler, + hls_include_path='', + hls_libs_path='', + ) + hls_model.write() + + # Check if config was properly parsed into the ModelGraph + + read_part = hls_model.config.get_config_value('Part') + expected_part = part if part is not None else 'xcvu13p-flga2577-2-e' + assert read_part == expected_part + + read_clock_period = hls_model.config.get_config_value('ClockPeriod') + expected_period = clock_period if clock_period is not None else 5 + assert read_clock_period == expected_period + + read_clock_unc = hls_model.config.get_config_value('ClockUncertainty') + expected_unc = clock_unc + if expected_unc is None: + if compiler == 'vivado_hls': + expected_unc = '12.5%' + else: + expected_unc = '27%' + assert read_clock_unc == expected_unc + + # Check if Writer properly wrote tcl scripts + part_ok = period_ok = unc_ok = False + + prj_tcl_path = output_dir / 'project.tcl' + with open(prj_tcl_path) as f: + for line in f.readlines(): + if 'set part' in line and expected_part in line: + part_ok = True + if f'set clock_period {expected_period}' in line: + period_ok = True + if f'set clock_uncertainty {expected_unc}' in line: + unc_ok = True + + assert part_ok and 
period_ok and unc_ok From a6fec3646f97b39e72a812fc47a01bb12cba9a0a Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 6 Mar 2024 20:39:47 +0100 Subject: [PATCH 027/103] Upsampling support for PyTorch models --- hls4ml/converters/pytorch/reshape.py | 44 +++++++++++ hls4ml/model/layers.py | 32 ++++++-- test/pytest/test_upsampling_pytorch.py | 100 +++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 6 deletions(-) create mode 100644 test/pytest/test_upsampling_pytorch.py diff --git a/hls4ml/converters/pytorch/reshape.py b/hls4ml/converters/pytorch/reshape.py index 5e5cde5261..37191135a1 100644 --- a/hls4ml/converters/pytorch/reshape.py +++ b/hls4ml/converters/pytorch/reshape.py @@ -1,6 +1,7 @@ import numpy as np from hls4ml.converters.pytorch_to_hls import pytorch_handler +from hls4ml.converters.utils import parse_data_format reshape_layers = ['View'] @@ -106,3 +107,46 @@ def parse_flatten_layer(operation, layer_name, input_names, input_shapes, node, output_shape = layer['target_shape'] return layer, output_shape + + +@pytorch_handler('Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d') +def handle_upsample(operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config): + + assert operation in ['Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d'] + layer = {} + layer['name'] = layer_name + layer['inputs'] = input_names + layer['class_name'] = 'Resize' + layer['data_format'] = 'channels_first' + + input_shape = parse_data_format(input_shapes[0], 'channels_first') + if len(input_shape) == 2: + layer['in_height'] = 1 + layer['in_width'], layer['n_chan'] = input_shape + + layer['out_height'] = 1 + layer['out_width'] = int(layer['in_width'] * class_object.scale_factor) + + output_shape = [input_shapes[0][0], layer['n_chan'], layer['out_width']] + elif len(input_shape) == 3: + layer['in_height'], layer['in_width'], layer['n_chan'] = input_shape + + scale_factor = class_object.scale_factor + if isinstance(scale_factor, 
tuple): + scale_height = scale_factor[0] + scale_width = scale_factor[1] + else: + scale_height = scale_factor + scale_width = scale_factor + + layer['out_height'] = int(layer['in_height'] * scale_height) + layer['out_width'] = int(layer['in_width'] * scale_width) + + output_shape = [layer['n_chan'], layer['out_height'], layer['out_width']] + else: + raise Exception(f'Parsing "Upsample" with {len(input_shape)}-dimensional tensors is not yet supported.') + + layer['algorithm'] = class_object.mode + layer['align_corners'] = bool(class_object.align_corners) + + return layer, output_shape diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index de191baa40..0d9cc0622c 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -912,14 +912,34 @@ def initialize(self): class Resize(Layer): + _expected_attributes = [ + Attribute('in_height'), + Attribute('in_width'), + Attribute('out_height'), + Attribute('out_width'), + Attribute('n_chan'), + ChoiceAttribute('algorithm', ['nearest', 'bilinear'], default='nearest'), + Attribute('align_corners', value_type=bool, default=False), + ] + def initialize(self): inp = self.get_input_variable() - if len(inp.shape) == 2: # 1D -> width + chan - shape = [self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] - elif len(inp.shape) == 3: # 2D -> height + width + chan - shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + + if self.get_attr('data_format') == 'channels_last': + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_HEIGHT_{self.index}', 
f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + else: + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_height'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] + self.add_output_variable(shape, dims, precision=inp.type.precision) diff --git a/test/pytest/test_upsampling_pytorch.py b/test/pytest/test_upsampling_pytorch.py new file mode 100644 index 0000000000..4a6c69ede4 --- /dev/null +++ b/test/pytest/test_upsampling_pytorch.py @@ -0,0 +1,100 @@ +from pathlib import Path + +import numpy as np +import pytest +import torch +import torch.nn as nn + +import hls4ml + +test_root_path = Path(__file__).parent + +in_height = 6 +in_width = 8 +in_feat = 4 + +size = 2 +atol = 5e-3 + + +@pytest.fixture(scope='module') +def data_1d(): + X = np.random.rand(100, in_feat, in_width) + return X + + +@pytest.fixture(scope='module') +def data_2d(): + X = np.random.rand(100, in_feat, in_height, in_width) + return X + + +class Upsample1DModel(nn.Module): + def __init__(self): + super().__init__() + self.upsample = nn.Upsample(scale_factor=2) + + def forward(self, x): + return self.upsample(x) + + +class Upsample2DModel(nn.Module): + def __init__(self): + super().__init__() + # this scale_factor tests proper output shape calculation with fractional scaling and parsing per-axis scales + self.upsample = nn.UpsamplingNearest2d(scale_factor=(1, 2.4)) # Would also work with Upsample(mode='nearest') + + def forward(self, x): + return self.upsample(x) + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_pytorch_upsampling1d(data_1d, io_type, backend): + model = Upsample1DModel() + + config = 
hls4ml.utils.config_from_pytorch_model( + model, + default_precision='ap_fixed<16,6>', + inputs_channel_last=True, # We don't test channels_last conversion here + transpose_outputs=False, + ) + odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_1d_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, (None, in_feat, in_width), hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + hls_model.compile() + + data_1d_t = np.ascontiguousarray(data_1d.transpose([0, 2, 1])) + + pytorch_prediction = model(torch.Tensor(data_1d)).detach().numpy() + hls_prediction = hls_model.predict(data_1d_t) + + pred_shape = list(pytorch_prediction.shape) + pred_shape.append(pred_shape.pop(1)) # Transpose shape to channels_last + hls_prediction = hls_prediction.reshape(pred_shape).transpose([0, 2, 1]) # Transpose back + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=1e-2, atol=0.01) + + +@pytest.mark.parametrize('io_type', ['io_parallel']) # Fractional scaling doesn't work with io_stream +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_pytorch_upsampling2d(data_2d, io_type, backend): + model = Upsample2DModel() + + config = hls4ml.utils.config_from_pytorch_model( + model, + default_precision='ap_fixed<16,6>', + inputs_channel_last=False, # With conversion to channels_last + transpose_outputs=True, + ) + odir = str(test_root_path / f'hls4mlprj_pytorch_upsampling_2d_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, (None, in_feat, in_height, in_width), hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + hls_model.compile() + + pytorch_prediction = model(torch.Tensor(data_2d)).detach().numpy().flatten() + hls_prediction = hls_model.predict(data_2d).flatten() + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=1e-2, atol=0.01) From 1b72b19905befc84bea30a2d17ad7d28c8dc0022 Mon Sep 17 00:00:00 2001 
From: Vladimir Loncar Date: Mon, 15 Apr 2024 20:58:25 +0200 Subject: [PATCH 028/103] Split Catapult types into separate file --- hls4ml/backends/catapult/catapult_backend.py | 3 +- hls4ml/backends/catapult/catapult_types.py | 92 +++++++++++++++++++ .../catapult/passes/transform_types.py | 6 +- hls4ml/backends/fpga/fpga_types.py | 65 ------------- 4 files changed, 96 insertions(+), 70 deletions(-) create mode 100644 hls4ml/backends/catapult/catapult_types.py diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 5556154dcb..0583e80dab 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -4,7 +4,8 @@ import numpy as np from hls4ml.backends import FPGABackend -from hls4ml.backends.fpga.fpga_types import ACTypeConverter, CatapultArrayVariableConverter, HLSTypeConverter +from hls4ml.backends.catapult.catapult_types import CatapultArrayVariableConverter +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute from hls4ml.model.flow import register_flow from hls4ml.model.layers import ( diff --git a/hls4ml/backends/catapult/catapult_types.py b/hls4ml/backends/catapult/catapult_types.py new file mode 100644 index 0000000000..92fbeb2db8 --- /dev/null +++ b/hls4ml/backends/catapult/catapult_types.py @@ -0,0 +1,92 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + StructMemberVariableConverter, + VariableDefinition, +) + +# region ArrayVariable + + +class CatapultArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + +class 
CatapultInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) + + +class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition + ) + + +# endregion + +# region StructMemberVariable + + +class CatapultStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class CatapultStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition + ) + + +# endregion + +# region StreamVariable + + +class CatapultStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration (string name arg not implemented in ac_channel) + return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class CatapultStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) + + +# endregion + +# region InplaceStreamVariable + + +class 
CatapultInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition + ) + + +# endregion diff --git a/hls4ml/backends/catapult/passes/transform_types.py b/hls4ml/backends/catapult/passes/transform_types.py index 4ef3548cb6..3cbb917a67 100755 --- a/hls4ml/backends/catapult/passes/transform_types.py +++ b/hls4ml/backends/catapult/passes/transform_types.py @@ -1,12 +1,10 @@ -from hls4ml.backends.fpga.fpga_types import ( - ACTypeConverter, +from hls4ml.backends.catapult.catapult_types import ( CatapultArrayVariableConverter, CatapultInplaceArrayVariableConverter, CatapultInplaceStreamVariableConverter, CatapultStreamVariableConverter, - HLSTypeConverter, - StaticWeightVariableConverter, ) +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 408f1320e4..c5327dab8c 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -248,13 +248,6 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) -class CatapultArrayVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( - type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma - ) - - class VivadoInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' @@ -265,11 +258,6 @@ def 
definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class CatapultInplaceArrayVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -297,11 +285,6 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) -class CatapultArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) - - class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) @@ -314,13 +297,6 @@ def __init__(self, type_converter): ) -class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition - ) - - # endregion # region StructMemberVariable @@ -333,13 +309,6 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) -class CatapultStructMemberVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}]'.format( - type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() - ) - - class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -369,13 +338,6 @@ def __init__(self, type_converter): ) -class CatapultStructMemberVariableConverter(StructMemberVariableConverter): - def __init__(self, type_converter): - super().__init__( 
- type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition - ) - - # endregion # region StreamVariable @@ -409,21 +371,6 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class CatapultStreamVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - if as_reference: # Function parameter - return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' - else: # Declaration (string name arg not implemented in ac_channel) - return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( - type=self.type.name, name=self.name, suffix=name_suffix - ) - - -class CatapultInplaceStreamVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -455,11 +402,6 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) -class CatapultStreamVariableConverter(StreamVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) - - # endregion # region InplaceStreamVariable @@ -493,13 +435,6 @@ def __init__(self, type_converter): ) -class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition - ) - - # endregion # region WeightsVariable From 28521d0e5148a039fad9d0fdb7656996ec84dafd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 15 Apr 2024 21:10:04 +0200 Subject: [PATCH 029/103] Split Quartus types into separate file --- hls4ml/backends/fpga/fpga_types.py | 63 
------------- .../quartus/passes/transform_types.py | 6 +- hls4ml/backends/quartus/quartus_types.py | 90 +++++++++++++++++++ 3 files changed, 92 insertions(+), 67 deletions(-) create mode 100644 hls4ml/backends/quartus/quartus_types.py diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index c5327dab8c..16c029828f 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -241,23 +241,11 @@ def definition_cpp(self, name_suffix='', as_reference=False): ) -class QuartusArrayVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}] {pragma}'.format( - type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma - ) - - class VivadoInplaceArrayVariableDefinition(VariableDefinition): def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class QuartusInplaceArrayVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -280,35 +268,16 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) -class QuartusArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) - - class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) -class QuartusInplaceArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__( - 
type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceArrayVariableDefinition - ) - - # endregion # region StructMemberVariable -class QuartusStructMemberVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}]'.format( - type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() - ) - - class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -331,13 +300,6 @@ def convert(self, tensor_var, pragma='partition', struct_name=None): return tensor_var -class QuartusStructMemberVariableConverter(StructMemberVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStructMemberVariableDefinition - ) - - # endregion # region StreamVariable @@ -358,19 +320,6 @@ def definition_cpp(self): return f'auto& {self.name} = {self.input_var.name}' -class QuartusStreamVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - if as_reference: # Function parameter - return f'stream<{self.type.name}> &{self.name}{name_suffix}' - else: # Declaration - return f'stream<{self.type.name}> {self.name}{name_suffix}' - - -class QuartusInplaceStreamVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -397,11 +346,6 @@ def __init__(self, type_converter): super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) -class QuartusStreamVariableConverter(StreamVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Quartus', 
definition_cls=QuartusStreamVariableDefinition) - - # endregion # region InplaceStreamVariable @@ -428,13 +372,6 @@ def __init__(self, type_converter): ) -class QuartusInplaceStreamVariableConverter(InplaceStreamVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceStreamVariableDefinition - ) - - # endregion # region WeightsVariable diff --git a/hls4ml/backends/quartus/passes/transform_types.py b/hls4ml/backends/quartus/passes/transform_types.py index 67de32ab65..041aad8136 100644 --- a/hls4ml/backends/quartus/passes/transform_types.py +++ b/hls4ml/backends/quartus/passes/transform_types.py @@ -1,12 +1,10 @@ -from hls4ml.backends.fpga.fpga_types import ( - ACTypeConverter, - HLSTypeConverter, +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter +from hls4ml.backends.quartus.quartus_types import ( QuartusArrayVariableConverter, QuartusInplaceArrayVariableConverter, QuartusInplaceStreamVariableConverter, QuartusStreamVariableConverter, QuartusStructMemberVariableConverter, - StaticWeightVariableConverter, ) from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable diff --git a/hls4ml/backends/quartus/quartus_types.py b/hls4ml/backends/quartus/quartus_types.py new file mode 100644 index 0000000000..e641c9aba7 --- /dev/null +++ b/hls4ml/backends/quartus/quartus_types.py @@ -0,0 +1,90 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + StructMemberVariableConverter, + VariableDefinition, +) + +# region ArrayVariable + + +class QuartusArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] {pragma}'.format( + type=self.type.name, name=self.name, suffix=name_suffix, 
shape=self.size_cpp(), pragma=self.pragma + ) + + +class QuartusInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) + + +class QuartusInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceArrayVariableDefinition + ) + + +# endregion + +# region StructMemberVariable + + +class QuartusStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class QuartusStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStructMemberVariableDefinition + ) + + +# endregion + +# region StreamVariable + + +class QuartusStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return f'stream<{self.type.name}> {self.name}{name_suffix}' + + +class QuartusInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) + + +# endregion + +# region 
InplaceStreamVariable + + +class QuartusInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceStreamVariableDefinition + ) + + +# endregion From a44707d30e92f855bb15716d17e1918a40d8412f Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 15 Apr 2024 21:15:40 +0200 Subject: [PATCH 030/103] Split Vivado types into separate file --- hls4ml/backends/fpga/fpga_types.py | 49 ------------- .../backends/vivado/passes/transform_types.py | 6 +- hls4ml/backends/vivado/vivado_backend.py | 3 +- hls4ml/backends/vivado/vivado_types.py | 70 +++++++++++++++++++ 4 files changed, 74 insertions(+), 54 deletions(-) create mode 100644 hls4ml/backends/vivado/vivado_types.py diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 16c029828f..15ad386c5a 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -234,18 +234,6 @@ def definition_cpp(self, name_suffix='', as_reference=False): # region ArrayVariable -class VivadoArrayVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - return '{type} {name}{suffix}[{shape}]'.format( - type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() - ) - - -class VivadoInplaceArrayVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -263,16 +251,6 @@ def convert(self, tensor_var, pragma='partition'): return tensor_var -class VivadoArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) - - -class 
VivadoInplaceArrayVariableConverter(ArrayVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) - - # endregion # region StructMemberVariable @@ -305,21 +283,6 @@ def convert(self, tensor_var, pragma='partition', struct_name=None): # region StreamVariable -class VivadoStreamVariableDefinition(VariableDefinition): - def definition_cpp(self, name_suffix='', as_reference=False): - if as_reference: # Function parameter - return f'hls::stream<{self.type.name}> &{self.name}{name_suffix}' - else: # Declaration - return 'hls::stream<{type}> {name}{suffix}("{name}")'.format( - type=self.type.name, name=self.name, suffix=name_suffix - ) - - -class VivadoInplaceStreamVariableDefinition(VariableDefinition): - def definition_cpp(self): - return f'auto& {self.name} = {self.input_var.name}' - - class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -341,11 +304,6 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var -class VivadoStreamVariableConverter(StreamVariableConverter): - def __init__(self, type_converter): - super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) - - # endregion # region InplaceStreamVariable @@ -365,13 +323,6 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var -class VivadoInplaceStreamVariableConverter(InplaceStreamVariableConverter): - def __init__(self, type_converter): - super().__init__( - type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceStreamVariableDefinition - ) - - # endregion # region WeightsVariable diff --git a/hls4ml/backends/vivado/passes/transform_types.py b/hls4ml/backends/vivado/passes/transform_types.py index 3462578e74..7bff3b8efc 100644 --- a/hls4ml/backends/vivado/passes/transform_types.py +++ 
b/hls4ml/backends/vivado/passes/transform_types.py @@ -1,7 +1,5 @@ -from hls4ml.backends.fpga.fpga_types import ( - APTypeConverter, - HLSTypeConverter, - StaticWeightVariableConverter, +from hls4ml.backends.fpga.fpga_types import APTypeConverter, HLSTypeConverter, StaticWeightVariableConverter +from hls4ml.backends.vivado.vivado_types import ( VivadoArrayVariableConverter, VivadoInplaceArrayVariableConverter, VivadoInplaceStreamVariableConverter, diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 64df42bb42..6bd57d6a88 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -4,7 +4,8 @@ import numpy as np from hls4ml.backends import FPGABackend -from hls4ml.backends.fpga.fpga_types import APTypeConverter, HLSTypeConverter, VivadoArrayVariableConverter +from hls4ml.backends.fpga.fpga_types import APTypeConverter, HLSTypeConverter +from hls4ml.backends.vivado.vivado_types import VivadoArrayVariableConverter from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute from hls4ml.model.flow import register_flow from hls4ml.model.layers import ( diff --git a/hls4ml/backends/vivado/vivado_types.py b/hls4ml/backends/vivado/vivado_types.py new file mode 100644 index 0000000000..ecac4a46a4 --- /dev/null +++ b/hls4ml/backends/vivado/vivado_types.py @@ -0,0 +1,70 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + InplaceStreamVariableConverter, + StreamVariableConverter, + VariableDefinition, +) + +# region ArrayVariable + + +class VivadoArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class VivadoInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = 
{self.input_var.name}' + + +class VivadoArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) + + +class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) + + +# endregion + +# region StreamVariable + + +class VivadoStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'hls::stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return 'hls::stream<{type}> {name}{suffix}("{name}")'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class VivadoInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class VivadoStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) + + +# endregion + +# region InplaceStreamVariable + + +class VivadoInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceStreamVariableDefinition + ) + + +# endregion From cefab60a29e19fdf7036d2b0fa8ed7e7cc75d27d Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Apr 2024 16:57:54 +0200 Subject: [PATCH 031/103] Increase precision of Softsign test --- test/pytest/test_softsign.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/pytest/test_softsign.py b/test/pytest/test_softsign.py index 217865fe46..31a2a1c2cf 100644 --- 
a/test/pytest/test_softsign.py +++ b/test/pytest/test_softsign.py @@ -14,11 +14,15 @@ @pytest.mark.parametrize('input_shape, io_type', [((8,), 'io_parallel'), ((8,), 'io_stream'), ((8, 8, 3), 'io_stream')]) def test_softsign(backend, input_shape, io_type): X = np.random.rand(1000, *input_shape) + X = np.round(X * 2**10) * 2**-10 model = tf.keras.models.Sequential() model.add(tf.keras.layers.Activation(input_shape=input_shape, activation='softsign', name='softsign')) model.compile() - cfg = hls4ml.utils.config_from_keras_model(model, granularity='name') + cfg = hls4ml.utils.config_from_keras_model(model, granularity='name', default_precision='fixed<20,4>') + # Since softsign implementation is lookup-based increasing the precision and size of the table helps with accuracy + cfg['LayerName']['softsign']['table_t'] = 'fixed<20,4>' + cfg['LayerName']['softsign']['table_size'] = 2048 odir = str(test_root_path / f'hls4mlprj_softsign_{backend}_{io_type}_{str(input_shape)}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=cfg, io_type=io_type, output_dir=odir, backend=backend From 440901b914f3cdc1d9a1aa24f22a2fba810cbd6b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Apr 2024 16:58:15 +0200 Subject: [PATCH 032/103] Use quantized input in binary CNN test --- test/pytest/test_binary_cnn.py | 47 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/test/pytest/test_binary_cnn.py b/test/pytest/test_binary_cnn.py index 7114e47263..40af056df9 100644 --- a/test/pytest/test_binary_cnn.py +++ b/test/pytest/test_binary_cnn.py @@ -33,57 +33,57 @@ def test_binary_cnn(backend, io_type, strategy): x = QConv2D( 4, (3, 3), - kernel_quantizer="binary", - name="conv2d_1", + kernel_quantizer='binary', + name='conv2d_1', kernel_regularizer=l2(0.0001), use_bias=True, bias_quantizer='quantized_bits(5,2)', )(x_in) x = QBatchNormalization()(x) - x = QActivation("binary", name="act1")(x) + x = QActivation('binary', 
name='act1')(x) - x = QConv2D(8, (3, 3), kernel_quantizer="binary", name="conv2d_2", kernel_regularizer=l2(0.0001), use_bias=False)(x) + x = QConv2D(8, (3, 3), kernel_quantizer='binary', name='conv2d_2', kernel_regularizer=l2(0.0001), use_bias=False)(x) x = QBatchNormalization()(x) - x = QActivation("binary", name="act2")(x) + x = QActivation('binary', name='act2')(x) x = MaxPooling2D(pool_size=(2, 2))(x) - x = QConv2D(8, (3, 3), kernel_quantizer="binary", name="conv2d_3", kernel_regularizer=l2(0.0001), use_bias=False)(x) + x = QConv2D(8, (3, 3), kernel_quantizer='binary', name='conv2d_3', kernel_regularizer=l2(0.0001), use_bias=False)(x) x = QBatchNormalization()(x) - x = QActivation("binary", name="act3")(x) + x = QActivation('binary', name='act3')(x) x = MaxPooling2D(pool_size=(2, 2))(x) x = Flatten()(x) - x = QDense(10, kernel_quantizer="binary", name="q_dense_6", use_bias=False)(x) + x = QDense(10, kernel_quantizer='binary', name='q_dense_6', use_bias=False)(x) x = QBatchNormalization()(x) - x = QActivation("binary_tanh", name="act4")(x) + x = QActivation('binary_tanh', name='act4')(x) - x = QDense(10, kernel_quantizer="binary", activation="linear", name="q_dense_7", use_bias=False)(x) + x = QDense(10, kernel_quantizer='binary', activation='linear', name='q_dense_7', use_bias=False)(x) model2 = Model(inputs=x_in, outputs=x) - model2.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) + model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) model2.summary() - hls_config = hls4ml.utils.config_from_keras_model(model2, granularity="name", default_precision='fixed<32,12>') - hls_config["Model"]["Strategy"] = strategy + hls_config = hls4ml.utils.config_from_keras_model(model2, granularity='name', default_precision='fixed<32,12>') + hls_config['Model']['Strategy'] = strategy - # hls_config["LayerName"]["q_dense_7_softmax"]["Implementation"] = "legacy" + # 
hls_config['LayerName']['q_dense_7_softmax']['Implementation'] = 'legacy' - hls_config["LayerName"]["conv2d_1"]["ReuseFactor"] = 9 - hls_config["LayerName"]["conv2d_2"]["ReuseFactor"] = 36 - hls_config["LayerName"]["conv2d_3"]["ReuseFactor"] = 72 - hls_config["LayerName"]["q_dense_6"]["ReuseFactor"] = 2000 - hls_config["LayerName"]["q_dense_7"]["ReuseFactor"] = 100 + hls_config['LayerName']['conv2d_1']['ReuseFactor'] = 9 + hls_config['LayerName']['conv2d_2']['ReuseFactor'] = 36 + hls_config['LayerName']['conv2d_3']['ReuseFactor'] = 72 + hls_config['LayerName']['q_dense_6']['ReuseFactor'] = 2000 + hls_config['LayerName']['q_dense_7']['ReuseFactor'] = 100 if backend == 'Quartus' and io_type == 'io_parallel': # Winegrad imp[lementation does not support binary - hls_config["LayerName"]["conv2d_1"]["Implementation"] = "im2col" - hls_config["LayerName"]["conv2d_2"]["Implementation"] = "im2col" - hls_config["LayerName"]["conv2d_3"]["Implementation"] = "im2col" + hls_config['LayerName']['conv2d_1']['Implementation'] = 'im2col' + hls_config['LayerName']['conv2d_2']['Implementation'] = 'im2col' + hls_config['LayerName']['conv2d_3']['Implementation'] = 'im2col' - output_dir = str(test_root_path / f"hls4mlprj_binary_cnn_{backend}_{io_type}_{strategy}") + output_dir = str(test_root_path / f'hls4mlprj_binary_cnn_{backend}_{io_type}_{strategy}') hls_model = hls4ml.converters.convert_from_keras_model( model2, hls_config=hls_config, @@ -93,6 +93,7 @@ def test_binary_cnn(backend, io_type, strategy): ) X = np.random.rand(100, 28, 28, 1) + X = np.round(X * 2**10) * 2**-10 hls_model.compile() y = model2.predict(X) # noqa: F841 From c351a0201fe0ae65fed8438bede03f3e695220bb Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:46:26 +0200 Subject: [PATCH 033/103] Add UnspecifiedPrecisionType --- hls4ml/model/types.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index 
ba926b11dc..fb5cde3863 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -223,6 +223,17 @@ def __str__(self): return typestring +class UnspecifiedPrecisionType(PrecisionType): + """ + Class representing an unspecified precision type. + + Instances of this class are expected to be replaced with concrete precision types during conversion. + """ + + def __init__(self): + super().__init__(width=0, signed=False) + + def find_minimum_width(data, signed=True): """ Helper function to find the minimum integer width to express all entries in the data array @@ -437,7 +448,9 @@ def __next__(self): def update_precision(self, new_precision): self.type.precision = new_precision - if isinstance(new_precision, (IntegerPrecisionType, XnorPrecisionType, ExponentPrecisionType)): + if isinstance(new_precision, UnspecifiedPrecisionType): + self.precision_fmt = '' # Temporarily set precision to undefined value + elif isinstance(new_precision, (IntegerPrecisionType, XnorPrecisionType, ExponentPrecisionType)): self.precision_fmt = '{:.0f}' elif isinstance(new_precision, FixedPrecisionType): decimal_spaces = max(0, new_precision.fractional) From 4d9d35a32e9a12e7be1c388fd76f8364dcb1e0bc Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:49:18 +0200 Subject: [PATCH 034/103] Rudimentary optimizer to infer 'auto' precision --- hls4ml/backends/fpga/fpga_backend.py | 14 +- hls4ml/backends/quartus/quartus_backend.py | 2 +- hls4ml/backends/vivado/vivado_backend.py | 2 +- hls4ml/model/optimizer/__init__.py | 2 + .../model/optimizer/passes/infer_precision.py | 290 ++++++++++++++++++ 5 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 hls4ml/model/optimizer/passes/infer_precision.py diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8cfaec8b3f..8d0ed64aad 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -34,8 +34,10 @@ ExponentPrecisionType, 
FixedPrecisionType, IntegerPrecisionType, + PrecisionType, RoundingMode, SaturationMode, + UnspecifiedPrecisionType, XnorPrecisionType, ) from hls4ml.writer import get_writer @@ -290,9 +292,12 @@ def get_valid_conv_partition_splits(self, out_height, out_width): @classmethod def convert_precision_string(cls, precision): - if isinstance(precision, IntegerPrecisionType) or isinstance(precision, FixedPrecisionType): + if isinstance(precision, PrecisionType): return precision + if precision.lower() == 'auto': + return cls._convert_auto_type(precision) + if precision.startswith('ac_'): return cls._convert_ac_type(precision) else: @@ -366,6 +371,13 @@ def _convert_ac_type(cls, precision): elif 'int' in precision: return IntegerPrecisionType(width, signed) + @classmethod + def _convert_auto_type(cls, precision): + ''' + Convert a "auto" precision string into the UnspecifiedPrecisionType + ''' + return UnspecifiedPrecisionType() + def product_type(self, data_T, weight_T): ''' Helper function to determine which product implementation to use during inference diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index 8e81e0a2e9..cf84016aee 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -72,7 +72,7 @@ def _register_flows(self): 'quartus:inplace_parallel_reshape', 'quartus:inplace_stream_flatten', 'quartus:skip_softmax', - 'quartus:fix_softmax_table_size', + 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 6bd57d6a88..47974e10c3 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -109,7 +109,7 @@ def _register_flows(self): 'vivado:inplace_parallel_reshape', 'vivado:inplace_stream_flatten', 'vivado:skip_softmax', - 
'vivado:fix_softmax_table_size', + 'infer_precision_types', ] optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 2e9b197475..5eab99db8a 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,6 +33,7 @@ register_flow( 'convert', [ + 'infer_precision_types', 'channels_last_converter', 'fuse_bias_add', 'remove_useless_transpose', @@ -51,6 +52,7 @@ 'fuse_consecutive_batch_normalization', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', + 'infer_precision_types', 'set_precision_concat', ], requires=['convert'], diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py new file mode 100644 index 0000000000..5ef1c2dee5 --- /dev/null +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -0,0 +1,290 @@ +from copy import deepcopy + +import numpy as np + +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType + + +class InferPrecisionTypes(OptimizerPass): + def match(self, node): + for layer_type in node.types.values(): + if isinstance(layer_type.precision, UnspecifiedPrecisionType): + return True + return False + + def transform(self, model, node): + types_to_infer = [] + for type_name, type_obj in node.types.items(): + if isinstance(type_obj.precision, UnspecifiedPrecisionType): + types_to_infer.append(type_name) + + inferred_types = self._infer_precision(node, types_to_infer) + for type_name in types_to_infer: + if type_name not in inferred_types: + self._infer_default_type(node, type_name) + + return False # No model graph changes made + + def _infer_precision(self, node, types_to_infer): + node_class = node.class_name + if node_class in ['Dense']: + return self._infer_dense_precision(node, types_to_infer) + + if node_class in 
['BatchNormalization']: + return self._infer_bn_precision(node, types_to_infer) + + if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: + return self._infer_conv_precision(node, types_to_infer) + + if node_class in ['SeparableConv1D', 'SeparableConv2D', 'DepthwiseConv2D']: + return self._infer_sepconv_precision(node, types_to_infer) + + if node_class in ['Pooling1D', 'Pooling2D']: + return self._infer_pooling_precision(node, types_to_infer) + + if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: + return self._infer_output_matching_precision(node, types_to_infer) + + if node_class in ['Concatenate', 'Merge']: + return self._infer_merge_precision(node, types_to_infer) + + # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent + # this in config_from_* functions + + return [] + + def _infer_default_type(self, node, type_name): + model_config = node.model.config + default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) + # No need to change the name of the NamedType since we use the default precision + node.types[type_name].precision = default_precision + + def _infer_output_matching_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + in_var = node.get_input_variable() + out_var = node.get_output_variable() + in_out_type = in_var.type.precision + out_var.type.precision = in_out_type + + return ['result_t'] + + def _infer_common_precision(self, node, types_to_infer, n_ops): + inferred_types = [] + + input_precision = node.get_input_variable().type.precision + input_width = input_precision.width + input_integers = input_precision.integer + + if 'weight_t' in types_to_infer: + weight_quantizer = node.get_attr('weight_quantizer', None) + if weight_quantizer is not None: + weight_width = weight_quantizer.bits + weight_integers 
= weight_quantizer.hls_type.integer + node.types['weight_t'].name = node.name + '_weight_t' + node.types['weight_t'].precision = weight_quantizer.hls_type + else: + self._infer_default_type(node, 'weight_t') + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer + node.weights['weight'].update_precision(node.types['weight_t'].precision) + + inferred_types.append('weight_t') + else: + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer + + if 'bias_t' in types_to_infer: + bias_quantizer = node.get_attr('bias_quantizer', None) + if bias_quantizer is not None: + bias_width = bias_quantizer.bits + bias_integers = bias_quantizer.hls_type.integer + node.types['bias_t'].name = node.name + '_bias_t' + node.types['bias_t'].precision = bias_quantizer.hls_type + else: + self._infer_default_type(node, 'bias_t') + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + node.weights['bias'].update_precision(node.types['bias_t'].precision) + + inferred_types.append('bias_t') + else: + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + + new_type = FixedPrecisionType( + width=int(max(np.ceil(input_width + weight_width + np.log2(n_ops)), bias_width) + 1), + integer=int(max(np.ceil(input_integers + weight_integers + np.log2(n_ops)), bias_integers) + 1), + ) + + if 'accum_t' in types_to_infer: + node.types['accum_t'].name = node.name + '_accum_t' + node.types['accum_t'].precision = new_type + + inferred_types.append('accum_t') + + if 'result_t' in types_to_infer: + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = new_type + + inferred_types.append('result_t') + + return inferred_types + + def _infer_dense_precision(self, node, types_to_infer): + n_ops = node.get_attr('n_in') * node.get_attr('n_out') + return 
self._infer_common_precision(node, types_to_infer, n_ops) + + def _infer_conv_precision(self, node, types_to_infer): + n_ops = node.get_attr('n_chan') * node.get_attr('filt_height', 1) * node.get_attr('filt_width') + return self._infer_common_precision(node, types_to_infer, n_ops) + + def _infer_sepconv_precision(self, node, types_to_infer): + inferred_types = [] + + input_precision = node.get_input_variable().type.precision + input_width = input_precision.width + input_integers = input_precision.integer + + if 'depthwise_t' in types_to_infer: + # TODO Current HLS implementations use data_T (input type) as the result hence this doesn't affect the output + # precision ATM, but this will probably change in the future + depthwise_quantizer = node.get_attr('depthwise_quantizer', None) + if depthwise_quantizer is not None: + node.types['depthwise_t'].name = node.name + '_depthwise_t' + node.types['depthwise_t'].precision = depthwise_quantizer.hls_type + else: + self._infer_default_type(node, 'depthwise_t') + node.weights['depthwise'].update_precision(node.types['depthwise_t'].precision) + + inferred_types.append('depthwise_t') + + if 'pointwise_t' in types_to_infer: + pointwise_quantizer = node.get_attr('pointwise_quantizer', None) + if pointwise_quantizer is not None: + pointwise_width = pointwise_quantizer.bits + pointwise_integers = pointwise_quantizer.hls_type.integer + node.types['pointwise_t'].name = node.name + '_pointwise_t' + node.types['pointwise_t'].precision = pointwise_quantizer.hls_type + else: + self._infer_default_type(node, 'pointwise_t') + pointwise_width = node.types['pointwise_t'].precision.width + pointwise_integers = node.types['pointwise_t'].precision.integer + node.weights['pointwise'].update_precision(node.types['pointwise_t'].precision) + + inferred_types.append('pointwise_t') + else: + pointwise_width = node.types['pointwise_t'].precision.width + pointwise_integers = node.types['pointwise_t'].precision.integer + + if 'bias_t' in 
types_to_infer: + bias_quantizer = node.get_attr('bias_quantizer', None) + if bias_quantizer is not None: + bias_width = bias_quantizer.bits + bias_integers = bias_quantizer.hls_type.integer + node.types['bias_t'].name = node.name + '_bias_t' + node.types['bias_t'].precision = bias_quantizer.hls_type + else: + self._infer_default_type(node, 'bias_t') + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + node.weights['bias'].update_precision(node.types['bias_t'].precision) + + inferred_types.append('bias_t') + else: + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + + n_ops = node.get_attr('n_chan') + new_type = FixedPrecisionType( + width=int(max(np.ceil(input_width + pointwise_width + np.log2(n_ops)), bias_width) + 1), + integer=int(max(np.ceil(input_integers + pointwise_integers + np.log2(n_ops)), bias_integers) + 1), + ) + + if 'accum_t' in types_to_infer: + node.types['accum_t'].name = node.name + '_accum_t' + node.types['accum_t'].precision = new_type + + inferred_types.append('accum_t') + + if 'result_t' in types_to_infer: + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = new_type + + inferred_types.append('result_t') + + return inferred_types + + def _infer_bn_precision(self, node, types_to_infer): + inferred_types = [] + + if 'scale_t' in types_to_infer: + self._infer_default_type(node, 'scale_t') + node.weights['scale'].update_precision(node.types['scale_t'].precision) + inferred_types.append('scale_t') + + if 'bias_t' in types_to_infer: + self._infer_default_type(node, 'bias_t') + node.weights['bias'].update_precision(node.types['bias_t'].precision) + inferred_types.append('bias_t') + + if 'result_t' in types_to_infer: + scale_precision = node.types['scale_t'].precision + bias_precision = node.types['bias_t'].precision + + out_precision = 
deepcopy(node.get_input_node().get_output_variable().type.precision) + out_precision.integer += scale_precision.integer + out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) + + out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 + out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) + out_precision.width = out_precision.fractional + out_precision.integer + + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + inferred_types.append('result_t') + + return inferred_types + + def _infer_pooling_precision(self, node, types_to_infer): + inferred_types = [] + + if 'accum_t' in types_to_infer: + input_precision = node.get_input_variable().type.precision + input_width = input_precision.width + input_integers = input_precision.integer + + n_ops = node.get_attr('n_filt') * node.get_attr('pool_height', 1) * node.get_attr('pool_width') + + accum_type = FixedPrecisionType( + width=int(np.ceil(input_width + np.log2(n_ops)) + 1), + integer=int(np.ceil(input_integers + np.log2(n_ops)) + 1), + ) + + node.types['accum_t'].name = node.name + '_accum_t' + node.types['accum_t'].precision = accum_type + + inferred_types.append('accum_t') + + if 'result_t' in types_to_infer: + self._infer_output_matching_precision(node, ['result_t']) + inferred_types.append('result_t') + + return inferred_types + + def _infer_merge_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) + new_int = max(input_1.integer, input_2.integer) + + out_precision = FixedPrecisionType(new_width, new_int) + node.types['result_t'].name = node.name + '_result_t' + 
node.types['result_t'].precision = out_precision + + return ['result_t'] From 32ae9b6362c8e99538fd23e1d50fb3817170a13b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:50:00 +0200 Subject: [PATCH 035/103] Auto precision test --- test/pytest/test_auto_precision.py | 255 +++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 test/pytest/test_auto_precision.py diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py new file mode 100644 index 0000000000..8454d1a7f8 --- /dev/null +++ b/test/pytest/test_auto_precision.py @@ -0,0 +1,255 @@ +from pathlib import Path + +import numpy as np +import pytest +from tensorflow.keras.layers import ( + AveragePooling1D, + AveragePooling2D, + BatchNormalization, + Conv1D, + Conv2D, + Dense, + Flatten, + ReLU, + SeparableConv1D, + SeparableConv2D, +) +from tensorflow.keras.models import Sequential + +import hls4ml + +test_root_path = Path(__file__).parent + +in_height = 10 +in_width = 12 +in_feat = 4 + + +@pytest.fixture(scope='module') +def data_1d(): + X = np.random.rand(100, in_feat) + return X + + +@pytest.fixture(scope='module') +def data_2d(): + X = np.random.rand(100, in_width, in_feat) + return X + + +@pytest.fixture(scope='module') +def data_3d(): + X = np.random.rand(100, in_height, in_width, in_feat) + return X + + +@pytest.fixture(scope='module') +def keras_model_dense(): + model = Sequential() + model.add(Dense(8, activation='relu', input_shape=(in_feat,), name='first_layer')) + model.add(BatchNormalization(name='first_bn')) + model.add(Dense(6, activation='relu', name='middle_layer')) + model.add(BatchNormalization(name='middle_bn')) + model.add(Dense(4, activation='relu', name='last_layer')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_conv1d(): + model = Sequential() + model.add(Conv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=(in_width, in_feat))) + 
model.add(AveragePooling1D(pool_size=2, name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv1D(4, kernel_size=2, activation='relu', name='middle_layer')) + model.add(Conv1D(4, kernel_size=1, activation='relu', name='last_layer')) # Will become PointwiseConv1D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_conv2d(): + model = Sequential() + model.add( + Conv2D(8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=(in_height, in_width, in_feat)) + ) + model.add(AveragePooling2D(pool_size=(2, 2), name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', name='middle_layer')) + model.add(Conv2D(4, kernel_size=(1, 1), activation='relu', name='last_layer')) # Will become PointwiseConv2D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_sepconv1d(): + model = Sequential() + model.add(SeparableConv1D(8, kernel_size=3, activation='linear', name='first_layer', input_shape=(in_width, in_feat))) + model.add(AveragePooling1D(pool_size=2, name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv1D(4, kernel_size=2, activation='relu', name='middle_layer')) + model.add(Conv1D(4, kernel_size=1, activation='relu', name='last_layer')) # Will become PointwiseConv1D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.fixture(scope='module') +def keras_model_sepconv2d(): + model = Sequential() + model.add( + SeparableConv2D( + 8, kernel_size=(3, 3), activation='linear', name='first_layer', input_shape=(in_height, in_width, in_feat) + ) + ) + model.add(AveragePooling2D(pool_size=(2, 2), name='first_pool')) + model.add(ReLU(name='first_act')) + model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', 
name='middle_layer')) + model.add(Conv2D(4, kernel_size=(1, 1), activation='relu', name='last_layer')) # Will become PointwiseConv2D + model.add(Flatten()) + model.add(Dense(4, activation='relu')) + model.compile() + return model + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +@pytest.mark.parametrize('model_type', ['conv1d', 'conv2d']) +def test_auto_precision_conv(keras_model_conv1d, keras_model_conv2d, data_2d, data_3d, model_type, io_type, backend): + if model_type == 'conv1d': + model = keras_model_conv1d + data = data_2d + else: + model = keras_model_conv2d + data = data_3d + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<16,6>', granularity='model') + config['LayerName'] = { + # Infer all types of these layers + 'first_layer': { + 'Precision': 'auto', + }, + 'first_pool': { + 'Precision': 'auto', + }, + # Infer only a few specific types for these layers + 'middle_layer': { + 'Precision': { + 'accum': 'auto', + 'weight': 'auto', + }, + }, + 'last_layer': { + 'Precision': { + 'result': 'auto', + }, + }, + } + odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + + # Compile will fail if there are still UnspecifiedPrecisionTypes in the model + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=2e-2, atol=5e-2, verbose=True) + + +@pytest.mark.parametrize('io_type', ['io_stream']) # Until we implement SeparableConv1D/2D for io_parallel +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) # No SeparableConv1D/2D in Quartus +@pytest.mark.parametrize('model_type', ['sepconv1d', 'sepconv2d']) +def test_auto_precision_sepconv( + keras_model_sepconv1d, 
keras_model_sepconv2d, data_2d, data_3d, model_type, io_type, backend +): + if model_type == 'sepconv1d': + model = keras_model_sepconv1d + data = data_2d + else: + model = keras_model_sepconv2d + data = data_3d + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<16,6>', granularity='model') + config['LayerName'] = { + # Infer all types of these layers + 'first_layer': { + 'Precision': 'auto', + }, + 'first_pool': { + 'Precision': 'auto', + }, + # Infer only a few specific types for these layers + 'middle_layer': { + 'Precision': { + 'accum': 'auto', + 'weight': 'auto', + }, + }, + 'last_layer': { + 'Precision': { + 'result': 'auto', + }, + }, + } + odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + + # Compile will fail if there are still UnspecifiedPrecisionTypes in the model + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=2e-2, atol=5e-2, verbose=True) + + +@pytest.mark.parametrize('io_type', ['io_stream', 'io_parallel']) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_auto_precision_dense(keras_model_dense, data_1d, io_type, backend): + model = keras_model_dense + data = data_1d + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<16,6>', granularity='model') + config['LayerName'] = { + # Infer all types of these layers + 'first_layer': { + 'Precision': 'auto', + }, + 'first_bn': { + 'Precision': 'auto', + }, + # Infer only a few specific types for these layers + 'middle_layer': { + 'Precision': { + 'accum': 'auto', + 'weight': 'auto', + }, + }, + 'last_layer': { + 'Precision': { + 'result': 'auto', + }, + }, + } + odir = str(test_root_path / 
f'hls4mlprj_auto_dense_{backend}_{io_type}') + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend + ) + + # Compile will fail if there are still UnspecifiedPrecisionTypes in the model + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=2e-2, atol=5e-2, verbose=True) From 932b01e235a0ea2e3a7183e15fa41c7cfb8409de Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:50:18 +0200 Subject: [PATCH 036/103] Sepconv fixes --- hls4ml/backends/vivado/passes/convolution_templates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 97972be36a..c990f08be0 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -285,6 +285,7 @@ def format(self, node): params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type + params['bias_t'] = node.get_weights('zero_bias').type params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): From 6a65fed1258c8f7070dc198c9eb49095b7f47511 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 26 Jan 2024 16:55:42 -0600 Subject: [PATCH 037/103] update precision propagation for signed, select im2col for quartus parallel conv --- .../model/optimizer/passes/infer_precision.py | 49 ++++++++++--------- test/pytest/test_auto_precision.py | 7 +++ 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 5ef1c2dee5..d2e166b557 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ 
b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,3 +1,4 @@ +import math from copy import deepcopy import numpy as np @@ -70,53 +71,57 @@ def _infer_output_matching_precision(self, node, types_to_infer): return ['result_t'] - def _infer_common_precision(self, node, types_to_infer, n_ops): + def _infer_common_precision(self, node, types_to_infer, n_ops, use_given_weights=False): + '''The option, use_given_weights, allows you to tailor for the given weights, in particular, zero bias''' inferred_types = [] input_precision = node.get_input_variable().type.precision input_width = input_precision.width input_integers = input_precision.integer + input_signed = input_precision.signed if 'weight_t' in types_to_infer: weight_quantizer = node.get_attr('weight_quantizer', None) if weight_quantizer is not None: - weight_width = weight_quantizer.bits - weight_integers = weight_quantizer.hls_type.integer node.types['weight_t'].name = node.name + '_weight_t' node.types['weight_t'].precision = weight_quantizer.hls_type else: self._infer_default_type(node, 'weight_t') - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer node.weights['weight'].update_precision(node.types['weight_t'].precision) - inferred_types.append('weight_t') - else: - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer + + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer + weight_signed = node.types['weight_t'].precision.signed if 'bias_t' in types_to_infer: bias_quantizer = node.get_attr('bias_quantizer', None) if bias_quantizer is not None: - bias_width = bias_quantizer.bits - bias_integers = bias_quantizer.hls_type.integer node.types['bias_t'].name = node.name + '_bias_t' node.types['bias_t'].precision = bias_quantizer.hls_type else: self._infer_default_type(node, 'bias_t') - bias_width = 
node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer node.weights['bias'].update_precision(node.types['bias_t'].precision) - inferred_types.append('bias_t') - else: - bias_width = node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer - new_type = FixedPrecisionType( - width=int(max(np.ceil(input_width + weight_width + np.log2(n_ops)), bias_width) + 1), - integer=int(max(np.ceil(input_integers + weight_integers + np.log2(n_ops)), bias_integers) + 1), - ) + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer + bias_signed = node.types['bias_t'].precision.signed + no_bias = node.weights['bias'].nonzeros == 0 and use_given_weights # no bias + + # using math.ceil instead of np.ceil because it returns an int + bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) + integers = weight_integers + input_integers + math.ceil(np.log2(n_ops)) + signed = weight_signed or input_signed + + frac = bitwidth - integers + + if not no_bias: + integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1 + bitwidth = integers + max(frac, bias_width - bias_integers) + signed = signed or bias_signed + + new_type = FixedPrecisionType(bitwidth, integers, signed) if 'accum_t' in types_to_infer: node.types['accum_t'].name = node.name + '_accum_t' @@ -133,7 +138,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): return inferred_types def _infer_dense_precision(self, node, types_to_infer): - n_ops = node.get_attr('n_in') * node.get_attr('n_out') + n_ops = node.get_attr('n_in') return self._infer_common_precision(node, types_to_infer, n_ops) def _infer_conv_precision(self, node, types_to_infer): diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py index 8454d1a7f8..cbb74aa12f 100644 --- a/test/pytest/test_auto_precision.py +++ 
b/test/pytest/test_auto_precision.py @@ -150,6 +150,13 @@ def test_auto_precision_conv(keras_model_conv1d, keras_model_conv2d, data_2d, da }, }, } + + # Winograd is not bit-accurate, so avoid it. + if backend == 'Quartus' and io_type == 'io_parallel': + config["LayerName"]["first_layer"]["Implementation"] = "im2col" + config["LayerName"]["middle_layer"]["Implementation"] = "im2col" + config["LayerName"]["last_layer"]["Implementation"] = "im2col" + odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, io_type=io_type, output_dir=odir, backend=backend From 41b7e98d6386bed647305ace39ff0d07c2599905 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 6 Feb 2024 22:24:37 +0100 Subject: [PATCH 038/103] Make inferring no_bias a configurable option of the optimizer --- hls4ml/model/optimizer/passes/infer_precision.py | 13 ++++++++----- test/pytest/test_auto_precision.py | 6 +++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index d2e166b557..6f6a72097f 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -3,11 +3,15 @@ import numpy as np -from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.optimizer import ConfigurableOptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType -class InferPrecisionTypes(OptimizerPass): +class InferPrecisionTypes(ConfigurableOptimizerPass): + def __init__(self): + # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias + self.infer_no_bias = False + def match(self, node): for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): @@ -71,8 +75,7 @@ def _infer_output_matching_precision(self, node, types_to_infer): return 
['result_t'] - def _infer_common_precision(self, node, types_to_infer, n_ops, use_given_weights=False): - '''The option, use_given_weights, allows you to tailor for the given weights, in particular, zero bias''' + def _infer_common_precision(self, node, types_to_infer, n_ops): inferred_types = [] input_precision = node.get_input_variable().type.precision @@ -107,7 +110,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops, use_given_weights bias_width = node.types['bias_t'].precision.width bias_integers = node.types['bias_t'].precision.integer bias_signed = node.types['bias_t'].precision.signed - no_bias = node.weights['bias'].nonzeros == 0 and use_given_weights # no bias + no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias # no bias # using math.ceil instead of np.ceil because it returns an int bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) diff --git a/test/pytest/test_auto_precision.py b/test/pytest/test_auto_precision.py index cbb74aa12f..356be1031a 100644 --- a/test/pytest/test_auto_precision.py +++ b/test/pytest/test_auto_precision.py @@ -153,9 +153,9 @@ def test_auto_precision_conv(keras_model_conv1d, keras_model_conv2d, data_2d, da # Winograd is not bit-accurate, so avoid it. 
if backend == 'Quartus' and io_type == 'io_parallel': - config["LayerName"]["first_layer"]["Implementation"] = "im2col" - config["LayerName"]["middle_layer"]["Implementation"] = "im2col" - config["LayerName"]["last_layer"]["Implementation"] = "im2col" + config['LayerName']['first_layer']['Implementation'] = 'im2col' + config['LayerName']['middle_layer']['Implementation'] = 'im2col' + config['LayerName']['last_layer']['Implementation'] = 'im2col' odir = str(test_root_path / f'hls4mlprj_auto_{model_type}_{backend}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( From 24253e1b724f228f287447873eca53dbaf4e3644 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 16 Apr 2024 15:49:21 -0500 Subject: [PATCH 039/103] updates to infering precision from qonnx branch --- .../model/optimizer/passes/infer_precision.py | 141 ++++++++++++++++-- 1 file changed, 128 insertions(+), 13 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 6f6a72097f..4de58a18c2 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,18 +1,29 @@ import math -from copy import deepcopy import numpy as np from hls4ml.model.optimizer import ConfigurableOptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType +# TODO: The code assumes everything is Fixed or Integer precision. 
Need to add checks + class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias self.infer_no_bias = False + self.count = 0 + self.MAX_COUNT = 1000 def match(self, node): + input_var = node.get_input_variable() + if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): + # need to wait for the input to update + # but check for infinite loops + self.count += 1 + if self.count == self.MAX_COUNT: + raise RuntimeError("There is an infinite loop in the precision inference.") + return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): return True @@ -29,14 +40,14 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - return False # No model graph changes made + return True # May need to rerun def _infer_precision(self, node, types_to_infer): node_class = node.class_name if node_class in ['Dense']: return self._infer_dense_precision(node, types_to_infer) - if node_class in ['BatchNormalization']: + if node_class in ['BatchNormalization', 'ApplyAlpha']: return self._infer_bn_precision(node, types_to_infer) if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: @@ -51,14 +62,24 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: return self._infer_output_matching_precision(node, types_to_infer) - if node_class in ['Concatenate', 'Merge']: + if node_class in ['Merge']: return self._infer_merge_precision(node, types_to_infer) + if node_class in ['Concatenate']: + return self._infer_cat_precision(node, types_to_infer) + + if node_class in ['Dot']: + return self._infer_dot_precision(node, types_to_infer) + # What about quantized activation layer? 
Setting it to 'auto' manually will break it here. We should prevent # this in config_from_* functions return [] + def _get_default_precision(self, node): + model_config = node.model.config + return model_config.backend.convert_precision_string(model_config.model_precision['default']) + def _infer_default_type(self, node, type_name): model_config = node.model.config default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) @@ -124,6 +145,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): bitwidth = integers + max(frac, bias_width - bias_integers) signed = signed or bias_signed + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. new_type = FixedPrecisionType(bitwidth, integers, signed) if 'accum_t' in types_to_infer: @@ -225,6 +247,11 @@ def _infer_sepconv_precision(self, node, types_to_infer): return inferred_types def _infer_bn_precision(self, node, types_to_infer): + """ + The batchnormalziation precision here is the more implementation-focused version. It propagates + precision from scale and bias, not mean, variance, etc. 
+ """ + inferred_types = [] if 'scale_t' in types_to_infer: @@ -238,16 +265,28 @@ def _infer_bn_precision(self, node, types_to_infer): inferred_types.append('bias_t') if 'result_t' in types_to_infer: + input_precision = node.get_input_variable().type.precision scale_precision = node.types['scale_t'].precision bias_precision = node.types['bias_t'].precision - out_precision = deepcopy(node.get_input_node().get_output_variable().type.precision) - out_precision.integer += scale_precision.integer - out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) + after_scale_signed = scale_precision.signed or input_precision.signed + after_scale_width = input_precision.width + scale_precision.width + after_scale_integer = input_precision.integer + scale_precision.integer + + out_precision_signed = after_scale_signed or bias_precision.signed + out_precision_integer = ( + max( + after_scale_integer + (bias_precision.signed and not after_scale_signed), + bias_precision.integer + (after_scale_signed and not bias_precision.signed), + ) + + 1 + ) + out_precision_width = out_precision_integer + max( + after_scale_width - after_scale_integer, bias_precision.fractional + ) - out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 - out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) - out_precision.width = out_precision.fractional + out_precision.integer + # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. 
+ out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision @@ -288,10 +327,86 @@ def _infer_merge_precision(self, node, types_to_infer): input_1 = node.get_input_variable(node.inputs[0]).type.precision input_2 = node.get_input_variable(node.inputs[1]).type.precision - new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) - new_int = max(input_1.integer, input_2.integer) + op = node.get_attr('op').lower() + if op in ('add', 'subtract', 'average'): + new_signed = input_1.signed or input_2.signed or op == 'subtract' + new_int = ( + max( + input_1.integer + (input_2.signed and not input_1.signed), + input_2.integer + (input_1.signed and not input_2.signed), + ) + + 1 + ) + new_width = new_int + max(input_1.fractional, input_2.fractional) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + elif op == 'multiply': + new_signed = input_1.signed or input_2.signed + new_int = input_1.integer + input_2.integer + new_width = input_1.width + input_2.width + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + elif op in ('maximum', 'minimum'): + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + else: + print(f'Warning: not propagating weights for type {op}') + out_precision = self._get_default_precision(node) + + node.types['result_t'].name = node.name + '_result_t' + 
node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_cat_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + new_signed = input_1.signed or input_2.signed + + input_1_integer = input_1.integer + input_2_integer = input_2.integer + + # add one to integer if unsigned while new is signed + if new_signed and not input_1.signed: + input_1_integer += 1 + if new_signed and not input_2.signed: + input_2_integer += 1 + + new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) + new_int = max(input_1_integer, input_2_integer) + + out_precision = FixedPrecisionType(new_width, new_int, new_signed) + node.types['result_t'].name = node.name + '_result_t' + node.types['result_t'].precision = out_precision + + return ['result_t'] + + def _infer_dot_precision(self, node, types_to_infer): + assert 'result_t' in types_to_infer and len(types_to_infer) == 1 + + input_1 = node.get_input_variable(node.inputs[0]).type.precision + input_2 = node.get_input_variable(node.inputs[1]).type.precision + + n_in = node.get_input_variable(node.inputs[0]).shape[0] + + new_signed = input_1.signed or input_2.signed + new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) + new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) - out_precision = FixedPrecisionType(new_width, new_int) + out_precision = FixedPrecisionType(new_width, new_int, new_signed) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision From 6ee81890973dfa92412b946ea17ebccdfbff6303 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 17 Apr 2024 09:59:54 -0500 Subject: [PATCH 040/103] remove count, become more selective on when True is returned --- 
hls4ml/model/optimizer/passes/infer_precision.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 4de58a18c2..ee585c42d6 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -12,17 +12,11 @@ class InferPrecisionTypes(ConfigurableOptimizerPass): def __init__(self): # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias self.infer_no_bias = False - self.count = 0 - self.MAX_COUNT = 1000 def match(self, node): input_var = node.get_input_variable() if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): - # need to wait for the input to update - # but check for infinite loops - self.count += 1 - if self.count == self.MAX_COUNT: - raise RuntimeError("There is an infinite loop in the precision inference.") + # only infer types if the input type is known return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): @@ -40,7 +34,9 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - return True # May need to rerun + # if the return type was set, this may allow InferPrecisionTypes to be run + # on layers it was not previously able to + return 'result_t' in types_to_infer def _infer_precision(self, node, types_to_infer): node_class = node.class_name From b5add0caefe02dfd2412dfa3355d97f8a0a39980 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Tue, 16 Apr 2024 17:08:49 -0700 Subject: [PATCH 041/103] fix pooling precision --- .../model/optimizer/passes/infer_precision.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index ee585c42d6..a38f61914a 100644 --- 
a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -296,15 +296,21 @@ def _infer_pooling_precision(self, node, types_to_infer): if 'accum_t' in types_to_infer: input_precision = node.get_input_variable().type.precision - input_width = input_precision.width - input_integers = input_precision.integer + pool_op = node.attributes['pool_op'].lower() - n_ops = node.get_attr('n_filt') * node.get_attr('pool_height', 1) * node.get_attr('pool_width') + width: int = input_precision.width + integer: int = input_precision.integer + signed: bool = input_precision.signed - accum_type = FixedPrecisionType( - width=int(np.ceil(input_width + np.log2(n_ops)) + 1), - integer=int(np.ceil(input_integers + np.log2(n_ops)) + 1), - ) + pool_size: int = node.get_attr('pool_height', 1) * node.get_attr('pool_width') + if pool_op == 'avg': + extra_bits = int(np.ceil(np.log2(pool_size))) + elif pool_op == 'max': + extra_bits = 0 + else: + raise ValueError(f'Unknown pooling operation: {pool_op}') + + accum_type = FixedPrecisionType(width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed) node.types['accum_t'].name = node.name + '_accum_t' node.types['accum_t'].precision = accum_type From 665c904aee185e0235b96496a1165ca2f581e702 Mon Sep 17 00:00:00 2001 From: Chang Sun Date: Wed, 17 Apr 2024 11:07:24 -0700 Subject: [PATCH 042/103] remove typing --- hls4ml/model/optimizer/passes/infer_precision.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index a38f61914a..3bc3a64772 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -298,11 +298,11 @@ def _infer_pooling_precision(self, node, types_to_infer): input_precision = node.get_input_variable().type.precision pool_op = node.attributes['pool_op'].lower() - width: int = 
input_precision.width - integer: int = input_precision.integer - signed: bool = input_precision.signed + width = input_precision.width + integer = input_precision.integer + signed = input_precision.signed - pool_size: int = node.get_attr('pool_height', 1) * node.get_attr('pool_width') + pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') if pool_op == 'avg': extra_bits = int(np.ceil(np.log2(pool_size))) elif pool_op == 'max': From b366d2488f0bb78067a41f2c34152ab6b4c63ab4 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Thu, 18 Apr 2024 21:13:42 +0200 Subject: [PATCH 043/103] Fix avg pooling op check --- hls4ml/model/optimizer/passes/infer_precision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 3bc3a64772..51422c534e 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -303,7 +303,7 @@ def _infer_pooling_precision(self, node, types_to_infer): signed = input_precision.signed pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') - if pool_op == 'avg': + if pool_op == 'average': extra_bits = int(np.ceil(np.log2(pool_size))) elif pool_op == 'max': extra_bits = 0 From f0ca86597aaee34ecf2dba4c22a4b0a230c666fd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 21 Feb 2024 21:39:00 +0100 Subject: [PATCH 044/103] Optimizer to remove expensive Transpose that serves as Flatten --- hls4ml/model/optimizer/__init__.py | 1 + .../passes/convert_to_channels_last.py | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 5eab99db8a..1a150e727d 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -35,6 +35,7 @@ [ 'infer_precision_types', 'channels_last_converter', + 
'remove_transpose_before_flatten', 'fuse_bias_add', 'remove_useless_transpose', 'expand_layer_group', diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 9c19711569..98ae549be5 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -2,8 +2,9 @@ # Based on https://github.com/fastmachinelearning/qonnx/blob/ # 12c96a3ded06beacab08e0f554e4ed014476c0aa/src/qonnx/transformation/channels_last.py -from hls4ml.model.layers import Concatenate, Input, Reshape +from hls4ml.model.layers import Concatenate, Dense, Input, Reshape, Transpose from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import WeightVariable class ChannelsLastConverter(OptimizerPass): @@ -133,3 +134,47 @@ def transform(self, model, node): node.channels_last_converted = True return True + + +class RemoveTransposeBeforeFlatten(OptimizerPass): + '''After the channels last conversion, model may have a sequence: Transpose -> Flatten -> Dense. 
+ In this case we can remove the expensive transpose and instead transpose the weights of the Dense layer.''' + + def match(self, node): + if node.model.config.get_config_value('IOType') != 'io_parallel': + return False + + if isinstance(node, Reshape): + input_node = node.get_input_node() + output_nodes = node.get_output_nodes() + if len(node.get_attr('target_shape')) == 1 and isinstance(input_node, Transpose) \ + and len(output_nodes) == 1 and isinstance(output_nodes[0], Dense): + return True + + return False + + def transform(self, model, node): + transpose_node = node.get_input_node() + dense_node = node.get_output_nodes()[0] + input_shape = transpose_node.get_output_variable().shape + + weight_var = dense_node.get_weights('weight') + # Transpose the weights to achieve the same computation with transposed input + weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(1, 2, 0, 3) + weight_data_t = weight_data_t.reshape(-1, weight_data_t.shape[-1]) + new_weight_var = WeightVariable( + var_name=weight_var.name, + type_name=weight_var.type.name, + precision=weight_var.type.precision, + quantizer=weight_var.quantizer, + data=weight_data_t, + index=dense_node.index + ) + + # Update the weight variable of the node + dense_node.set_attr('weight', new_weight_var) + + # Get rid of the Transpose node + model.remove_node(transpose_node) + + return True \ No newline at end of file From 1e416b5cdb3ee9cb4a75577347fab5820612c731 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:04 +0100 Subject: [PATCH 045/103] Generalize removal of Transpose after flatten so it works on 1D as well --- .../passes/convert_to_channels_last.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 98ae549be5..01e949086e 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ 
b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -144,23 +144,38 @@ def match(self, node): if node.model.config.get_config_value('IOType') != 'io_parallel': return False + if hasattr(node, '_channels_last_keep_transpose') and node._channels_last_keep_transpose: + return False + if isinstance(node, Reshape): input_node = node.get_input_node() output_nodes = node.get_output_nodes() - if len(node.get_attr('target_shape')) == 1 and isinstance(input_node, Transpose) \ - and len(output_nodes) == 1 and isinstance(output_nodes[0], Dense): + if ( + len(node.get_attr('target_shape')) == 1 + and isinstance(input_node, Transpose) + and len(output_nodes) == 1 + and isinstance(output_nodes[0], Dense) + ): return True - + return False - + def transform(self, model, node): transpose_node = node.get_input_node() dense_node = node.get_output_nodes()[0] input_shape = transpose_node.get_output_variable().shape + if len(input_shape) == 2: # Usually after Conv1D + tran_axis = [1, 0, 2] + elif len(input_shape) == 3: # Usually after Conv2D + tran_axis = [1, 2, 0, 3] + else: # In this case we bail + node._channels_last_keep_transpose = True + return False + weight_var = dense_node.get_weights('weight') # Transpose the weights to achieve the same computation with transposed input - weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(1, 2, 0, 3) + weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(*tran_axis) weight_data_t = weight_data_t.reshape(-1, weight_data_t.shape[-1]) new_weight_var = WeightVariable( var_name=weight_var.name, @@ -168,13 +183,13 @@ def transform(self, model, node): precision=weight_var.type.precision, quantizer=weight_var.quantizer, data=weight_data_t, - index=dense_node.index + index=dense_node.index, ) - + # Update the weight variable of the node dense_node.set_attr('weight', new_weight_var) # Get rid of the Transpose node model.remove_node(transpose_node) - return True \ No newline at end of file + return True From 
2a5d8de2134c7779e617cb981b49108eb99fe45e Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:35 +0100 Subject: [PATCH 046/103] Remove transpose of input if n_chan=1 --- hls4ml/model/optimizer/__init__.py | 3 +- .../model/optimizer/passes/transpose_opt.py | 40 +++++++++-- test/pytest/test_pytorch_api.py | 69 +++++++++++++++++++ 3 files changed, 104 insertions(+), 8 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 1a150e727d..3aa247d03f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,8 +36,9 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', + 'remove_nop_transpose', + 'remove_single_channel_transpose', 'fuse_bias_add', - 'remove_useless_transpose', 'expand_layer_group', 'output_rounding_saturation_mode', 'qkeras_factorize_alpha', diff --git a/hls4ml/model/optimizer/passes/transpose_opt.py b/hls4ml/model/optimizer/passes/transpose_opt.py index a5bff8a703..1c0328fb34 100644 --- a/hls4ml/model/optimizer/passes/transpose_opt.py +++ b/hls4ml/model/optimizer/passes/transpose_opt.py @@ -1,21 +1,47 @@ -from hls4ml.model.layers import Transpose +from hls4ml.model.layers import Input, Transpose from hls4ml.model.optimizer import OptimizerPass -class RemoveUselessTranspose(OptimizerPass): +class RemoveNopTranspose(OptimizerPass): + """ + Remove a transpose layer if it doesn't do anything to a 1D array. i.e, 1D input and perm = [0] + """ + def match(self, node): is_match = isinstance(node, Transpose) and node.get_attr('perm') == [0] # Useless transpose return is_match def transform(self, model, node): - """ - Remove a transpose layer if it doesn't do anything. i.e 1D input and perm = [0] - """ - print(f"Unnessary {node.name} in the model, optimizing ...") + print(f'Unnecessary transpose node ({node.name}) detected, optimizing ...') if not node.get_output_nodes(): - print(f"WARNING: {node.name} is the output layer! 
No rewiring performed.") + print(f'WARNING: {node.name} is the output layer! No rewiring performed.') model.remove_node(node, rewire=False) # Don't rewire if there is no output layer else: model.remove_node(node, rewire=True) return True + + +class RemoveSingleChannelTranspose(OptimizerPass): + """ + Remove transpose of inputs if the number of channels is 1 as for io_parallel this doesn't affect the array + representation used + """ + + def match(self, node): + if node.model.config.get_config_value('IOType') != 'io_parallel': + return False + + return ( + isinstance(node, Transpose) + and isinstance(node.get_input_node(), Input) + and node.get_input_variable().shape[0] == 1 + ) + + def transform(self, model, node): + # Adjust the input shape and remove the Transpose node + input_var = node.get_input_variable() + input_var.shape.append(input_var.shape.pop(0)) + model.remove_node(node) + + return True diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index f9bc175ca2..f5985d0dab 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -740,3 +740,72 @@ def test_skipped_layers(backend, io_type): hls_prediction = hls_model.predict(hls_input).flatten() np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) + + +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus']) +@pytest.mark.parametrize('io_type', ['io_parallel']) # Only io_parallel for now +@pytest.mark.parametrize('tensor_rank', [2, 3]) +def test_remove_transpose(backend, io_type, tensor_rank): + class TestModel(nn.Module): + def __init__(self, tensor_rank): + super().__init__() + if tensor_rank == 2: + self.conv1 = nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3, bias=False) + self.relu1 = nn.ReLU() + self.flatten = nn.Flatten() + self.fc1 = nn.Linear(in_features=4 * 6, out_features=5, bias=False) + self.relu2 = nn.ReLU() + else: + self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, bias=False) + 
self.relu1 = nn.ReLU() + self.flatten = nn.Flatten() + self.fc1 = nn.Linear(in_features=4 * 6 * 6, out_features=5, bias=False) + self.relu2 = nn.ReLU() + + def forward(self, x): + # In the hls4ml model, there should be a Transpose node on the input tensor before conv1 + x = self.conv1(x) + x = self.relu1(x) + x = self.flatten(x) # This should result in a Transpose node that we aim to remove + x = self.fc1(x) + x = self.relu2(x) + return x + + model = TestModel(tensor_rank=tensor_rank) + if tensor_rank == 2: + input_shape = (1, 8) + input_tensor = torch.randn(10, 1, 8) + hls_input = np.ascontiguousarray(torch.permute(input_tensor, (0, 2, 1)).detach().numpy()) + else: + input_shape = (1, 8, 8) + input_tensor = torch.randn(10, 1, 8, 8) + hls_input = np.ascontiguousarray(torch.permute(input_tensor, (0, 2, 3, 1)).detach().numpy()) + + batch_input_shape = (None,) + input_shape + config = config_from_pytorch_model( + model, + default_precision='ap_fixed<32,16>', + inputs_channel_last=False, # Crucial for testing if the first Transpose was removed + transpose_outputs=False, + ) + output_dir = str(test_root_path / f'hls4mlprj_pytorch_api_transpose_nop_{tensor_rank}d_{backend}_{io_type}') + hls_model = convert_from_pytorch_model( + model, + batch_input_shape, + hls_config=config, + output_dir=output_dir, + io_type=io_type, + backend=backend, + ) + + hls_model.compile() + + # Test optimizers removed the two Transpose layers + transpose_layers = [layer for layer in list(hls_model.get_layers()) if layer.class_name == 'Transpose'] + assert len(transpose_layers) == 0 + + # Test predictions match + pytorch_prediction = model(input_tensor).detach().numpy().flatten() + hls_prediction = hls_model.predict(hls_input).flatten() + + np.testing.assert_allclose(hls_prediction, pytorch_prediction, rtol=0, atol=5e-2) From 3969523dd45e624a2e56eabdc4724a6fc90fc9ad Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 13 May 2024 19:09:47 +0200 Subject: [PATCH 047/103] SepConv1d/2d for 
io_parallel w/ Latency strategy --- hls4ml/backends/fpga/passes/codegen.py | 80 +++++++++++++++-- .../vivado/passes/convolution_templates.py | 24 +++-- .../vivado/nnet_utils/nnet_sepconv1d.h | 46 ++++++++++ .../nnet_utils/nnet_sepconv1d_latency.h | 86 ++++++++++++++++++ .../vivado/nnet_utils/nnet_sepconv2d.h | 51 +++++++++++ .../nnet_utils/nnet_sepconv2d_latency.h | 87 +++++++++++++++++++ 6 files changed, 360 insertions(+), 14 deletions(-) create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index f1f1080996..b8f367137b 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import Source @@ -7,16 +7,27 @@ class GenerateConvIm2col(OptimizerPass): '''Generates tcode for im2col step of 1D/2d convolution''' def match(self, node): - return isinstance(node, (Conv1D, Conv2D)) and node.model.config.get_config_value('IOType') == 'io_parallel' + return ( + isinstance(node, (Conv1D, Conv2D, SeparableConv1D, SeparableConv2D)) + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) def transform(self, model, node): - node_class = node.__class__.__name__ - if '1D' in node_class: - self._generate_im2col_1d(node) - elif '2D' in node_class: - self._generate_im2col_2d(node) + node_class = node.class_name + if 'Separable' in node_class: + if '1D' in node_class: + self._generate_separable_im2col_1d(node) + elif '2D' in node_class: + self._generate_separable_im2col_2d(node) + 
else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') else: - raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') + if '1D' in node_class: + self._generate_im2col_1d(node) + elif '2D' in node_class: + self._generate_im2col_2d(node) + else: + raise Exception(f'Cannot generate instructions for node {node.name} ({node_class})') def _generate_im2col_1d(self, node): code_str = node.model.config.backend.generate_conv1d_line_buffer_fn( @@ -49,3 +60,56 @@ def _generate_im2col_2d(self, node): ) node.set_attr('line_buffer_codegen', Source(code_str)) + + def _generate_separable_im2col_1d(self, node): + dw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn( + str(node.get_attr('index')) + '_dw', + node.get_attr('n_partitions'), + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + kernel=node.get_attr('filt_width'), + stride=node.get_attr('stride_width'), + pad=(node.get_attr('pad_left'), node.get_attr('pad_right')), + ) + + node.set_attr('dw_line_buffer_codegen', Source(dw_code_str)) + + pw_code_str = node.model.config.backend.generate_conv1d_line_buffer_fn( + str(node.get_attr('index')) + '_pw', + node.get_attr('n_partitions'), + node.get_output_variable().shape[0], + node.get_output_variable().shape[1], + kernel=1, + ) + + node.set_attr('pw_line_buffer_codegen', Source(pw_code_str)) + + def _generate_separable_im2col_2d(self, node): + dw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn( + str(node.get_attr('index')) + '_dw', + node.get_attr('n_partitions'), + node.get_input_variable().shape[0], + node.get_input_variable().shape[1], + node.get_input_variable().shape[2], + kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')), + stride=(node.get_attr('stride_height'), node.get_attr('stride_width')), + pad=( + node.get_attr('pad_top'), + node.get_attr('pad_bottom'), + node.get_attr('pad_left'), + node.get_attr('pad_right'), + ), + ) + 
+ node.set_attr('dw_line_buffer_codegen', Source(dw_code_str)) + + pw_code_str = node.model.config.backend.generate_conv2d_line_buffer_fn( + str(node.get_attr('index')) + '_pw', + node.get_attr('n_partitions'), + node.get_output_variable().shape[0], + node.get_output_variable().shape[1], + node.get_input_variable().shape[2], + kernel=(1, 1), + ) + + node.set_attr('pw_line_buffer_codegen', Source(pw_code_str)) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index c990f08be0..43a8b4fb7d 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -254,8 +254,8 @@ def __init__(self): '{input}, {output}, {d}, {p}, {z}, {b});' ) -sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] -sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] +sepconv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_sepconv1d.h', 'nnet_utils/nnet_sepconv1d_stream.h'] +sepconv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_sepconv2d.h', 'nnet_utils/nnet_sepconv2d_stream.h'] class SeparableConv1DConfigTemplate(LayerConfigTemplate): @@ -286,7 +286,10 @@ def format(self, node): params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type params['bias_t'] = node.get_weights('zero_bias').type - params['fill_fn'] = 'FillConv1DBuffer' + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_dw' + else: + params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): params['scale_index_type'] = 'scale_index_unscaled' @@ -323,7 +326,10 @@ def format(self, node): params['weight_t'] = node.get_weights('pointwise').type params['min_width'] = params['in_width'] params['instructions'] = '0' - params['fill_fn'] = 'FillConv1DBuffer' + if 
node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_dw' + else: + params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): params['scale_index_type'] = 'scale_index_unscaled' @@ -402,7 +408,10 @@ def format(self, node): params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type - params['fill_fn'] = 'FillConv2DBuffer' + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_dw' + else: + params['fill_fn'] = 'FillConv2DBuffer' if node.get_attr('unscaled_h'): params['scale_index_height_type'] = 'scale_index_unscaled' @@ -447,7 +456,10 @@ def format(self, node): params['min_height'] = params['in_height'] params['min_width'] = params['in_width'] params['instructions'] = '0' - params['fill_fn'] = 'FillConv2DBuffer' + if node.model.config.get_config_value('IOType') == 'io_parallel': + params['fill_fn'] = f'fill_buffer_{node.index}_pw' + else: + params['fill_fn'] = 'FillConv2DBuffer' if node.get_attr('unscaled_h'): params['scale_index_height_type'] = 'scale_index_unscaled' diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h new file mode 100644 index 0000000000..d804af260c --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d.h @@ -0,0 +1,46 @@ +#ifndef NNET_SEPARABLE_CONV1D_H_ +#define NNET_SEPARABLE_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d.h" +#include "nnet_sepconv1d_latency.h" +//#include "nnet_sepconv1d_resource.h" +#include + +namespace nnet { + +template +void depthwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS 
INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_conv_1d_latency_cl(data, res, weights, biases); + } else { + assert("Resource strategy for DepthwiseConv1D is not supported." && false); + } +} + +template +void separable_conv_1d_cl(data_T data[CONFIG_T::depthwise_config::in_width * CONFIG_T::depthwise_config::n_chan], + res_T res[CONFIG_T::pointwise_config::out_width * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS INLINE recursive + + dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_width * CONFIG_T::depthwise_config::n_filt]; + + depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_1d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h new file mode 100644 index 0000000000..c9fe86ea93 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_latency.h @@ -0,0 +1,86 @@ +#ifndef NNET_SEPARABLE_CONV1D_LATENCY_H_ +#define NNET_SEPARABLE_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void depthwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+ + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_acc = CONFIG_T::filt_width; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + mult[i_in] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in]); + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_acc; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h new file mode 100644 index 0000000000..9ec638375d --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d.h @@ -0,0 +1,51 @@ +#ifndef NNET_SEPARABLE_CONV2D_H_ +#define NNET_SEPARABLE_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d.h" +#include "nnet_sepconv2d_latency.h" +//#include "nnet_sepconv2d_resource.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { + #pragma HLS INLINE recursive + if (CONFIG_T::strategy == nnet::latency) { + depthwise_conv_2d_latency_cl(data, res, weights, biases); + } else { + assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); + } +} + +template +void separable_conv_2d_cl(data_T data[CONFIG_T::depthwise_config::in_height * CONFIG_T::depthwise_config::in_width * + CONFIG_T::depthwise_config::n_chan], + res_T res[CONFIG_T::pointwise_config::out_height * CONFIG_T::pointwise_config::out_width * + CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::weight_t + depthwise_weights[CONFIG_T::depthwise_config::filt_height * + CONFIG_T::depthwise_config::filt_width * CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::weight_t + pointwise_weights[CONFIG_T::pointwise_config::n_chan * CONFIG_T::pointwise_config::n_filt], + typename CONFIG_T::depthwise_config::bias_t depthwise_biases[CONFIG_T::depthwise_config::n_chan], + typename CONFIG_T::pointwise_config::bias_t pointwise_biases[CONFIG_T::pointwise_config::n_filt]) { + #pragma HLS INLINE recursive + + dw_res_T depthwise_res[CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width * + CONFIG_T::depthwise_config::n_filt]; + + depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, + depthwise_biases); + pointwise_conv_2d_cl(depthwise_res, res, pointwise_weights, + pointwise_biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h new file mode 100644 index 0000000000..161cc2c834 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_latency.h @@ -0,0 +1,87 @@ +#ifndef NNET_SEPARABLE_CONV2D_LATENCY_H_ +#define NNET_SEPARABLE_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void depthwise_conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + typename 
CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_acc = CONFIG_T::filt_height * CONFIG_T::filt_width; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + mult[i_in] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in]); + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_acc; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL 
+ *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif From 52252ca8cc5898b83fa0a7fc124bff18ca384ca8 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 13 May 2024 20:29:59 +0200 Subject: [PATCH 048/103] Cosmetic parameter config fixes --- hls4ml/backends/fpga/fpga_backend.py | 4 ++-- hls4ml/backends/fpga/passes/codegen.py | 2 +- hls4ml/backends/vivado/passes/convolution_templates.py | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 8d0ed64aad..87309ff4e5 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -685,7 +685,7 @@ def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, ke The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since - the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc), + the result depends on the parameters of the convolution layer (the input size, the kernel size, stride etc), we need to do this for every convolution layer. Args: @@ -782,7 +782,7 @@ def generate_conv2d_line_buffer_fn( The HLS compiler produces suboptimal designs for a im2col algorithm implementation, so a trick we use is to generate a resulting a result of im2col transformation explicitly, instead of relying on loops. Since - the result depends on the paraleters of the convolution layer (the input size, the kernel size, stride etc), + the result depends on the parameters of the convolution layer (the input size, the kernel size, stride etc), we need to do this for every convolution layer. 
Args: diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/codegen.py index b8f367137b..c951a02b80 100644 --- a/hls4ml/backends/fpga/passes/codegen.py +++ b/hls4ml/backends/fpga/passes/codegen.py @@ -78,7 +78,7 @@ def _generate_separable_im2col_1d(self, node): str(node.get_attr('index')) + '_pw', node.get_attr('n_partitions'), node.get_output_variable().shape[0], - node.get_output_variable().shape[1], + node.get_input_variable().shape[1], kernel=1, ) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 43a8b4fb7d..037f2d5eb2 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -320,6 +320,7 @@ def format(self, node): params['filt_width'] = 1 params['stride_width'] = 1 + params['pad_left'] = params['pad_right'] = 0 params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('pointwise').nzeros params['index'] = str(node.index) + '_pointwise' @@ -327,7 +328,7 @@ def format(self, node): params['min_width'] = params['in_width'] params['instructions'] = '0' if node.model.config.get_config_value('IOType') == 'io_parallel': - params['fill_fn'] = f'fill_buffer_{node.index}_dw' + params['fill_fn'] = f'fill_buffer_{node.index}_pw' else: params['fill_fn'] = 'FillConv1DBuffer' @@ -449,6 +450,8 @@ def format(self, node): params['filt_height'] = params['filt_width'] = 1 params['stride_height'] = params['stride_width'] = 1 + params['pad_left'] = params['pad_right'] = 0 + params['pad_top'] = params['pad_bottom'] = 0 params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('pointwise').nzeros params['index'] = str(node.index) + '_pointwise' From be56b9347873600296d65aecb8a0ca115e212871 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 13 May 2024 20:32:07 +0200 Subject: [PATCH 049/103] Tests for SepConv io_parallel --- 
test/pytest/test_sepconv1d.py | 4 ++-- test/pytest/test_sepconv2d.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64b72db48a..e64bd06a76 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -12,9 +12,9 @@ keras_conv1d = [SeparableConv1D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_stream'] +io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1), (2)] -kernel_options = [(1), (3)] +kernel_options = [(2), (3)] bias_options = [False] diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 2fa2d94afe..da87488aa2 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -12,19 +12,19 @@ keras_conv2d = [SeparableConv2D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_stream'] +io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1, 1), (2, 2)] kernel_options = [(2, 2), (3, 3)] bias_options = [False] -@pytest.mark.parametrize("conv2d", keras_conv2d) -@pytest.mark.parametrize("chans", chans_options) -@pytest.mark.parametrize("padds", padds_options) -@pytest.mark.parametrize("strides", strides_options) -@pytest.mark.parametrize("kernels", kernel_options) -@pytest.mark.parametrize("bias", bias_options) -@pytest.mark.parametrize("io_type", io_type_options) +@pytest.mark.parametrize('conv2d', keras_conv2d) +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('strides', strides_options) +@pytest.mark.parametrize('kernels', kernel_options) +@pytest.mark.parametrize('bias', bias_options) +@pytest.mark.parametrize('io_type', io_type_options) @pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = 
tf.keras.models.Sequential() From b0085a11d38ab8f1ca8ec239e4b3b20128e4f64f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 20:11:49 +0000 Subject: [PATCH 050/103] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.4.0 → 24.4.2](https://github.com/psf/black/compare/24.4.0...24.4.2) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 275b349422..6db9312eb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.4.0 + rev: 24.4.2 hooks: - id: black language_version: python3 From 44bc8f33f350c652ba2ae60edec7ad96f5d26d40 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 26 Apr 2024 18:23:10 -0500 Subject: [PATCH 051/103] Update pytest docker image to 0.5.4 --- test/pytest/ci-template.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index 50e9f799f6..fa4e7c9d8a 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,10 +1,11 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.4.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.4.base tags: - k8s-default before_script: - source ~/.bashrc + - git config --global --add safe.directory /builds/fastmachinelearning/hls4ml - git submodule update --init --recursive hls4ml/templates/catapult/ - if [ $EXAMPLEMODEL == 1 ]; then git submodule update --init example-models; fi - conda activate hls4ml-testing From a7826e07425ab2ec703cbe1150485c99c9837198 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: 
Wed, 1 May 2024 13:58:43 -0500 Subject: [PATCH 052/103] bump to 0.5.5 --- test/pytest/ci-template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index fa4e7c9d8a..afaf90da4d 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,6 +1,6 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.4.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.5.base tags: - k8s-default before_script: From 41ab6af33228f4c8a3ce3ea33dabf20811dbab96 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 25 Apr 2024 16:03:05 -0500 Subject: [PATCH 053/103] fix pre-commit warning --- test/pytest/test_weight_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_weight_writer.py b/test/pytest/test_weight_writer.py index 168b781a67..431f10970b 100644 --- a/test/pytest/test_weight_writer.py +++ b/test/pytest/test_weight_writer.py @@ -29,5 +29,5 @@ def test_weight_writer(k, i, f): print(w_paths[0]) assert len(w_paths) == 1 w_loaded = np.loadtxt(w_paths[0], delimiter=',').reshape(1, 1) - print(f'{w[0,0]:.14}', f'{w_loaded[0,0]:.14}') + print(f'{w[0, 0]:.14}', f'{w_loaded[0, 0]:.14}') assert np.all(w == w_loaded) From c0f8d9f3bd0be0ee340168c483c9995c705752fb Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 26 Apr 2024 18:35:04 -0500 Subject: [PATCH 054/103] change writing of obsolete ".h5" to ".keras" files --- hls4ml/writer/catapult_writer.py | 2 +- hls4ml/writer/quartus_writer.py | 2 +- hls4ml/writer/vivado_writer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index 48d44e4a59..af3f28a59e 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -884,7 +884,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - 
model_path = model.config.get_output_dir() + '/keras_model.h5' + model_path = model.config.get_output_dir() + '/keras_model.keras' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index f8f3d76188..8c0217f924 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -1322,7 +1322,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - model_path = model.config.get_output_dir() + '/keras_model.h5' + model_path = model.config.get_output_dir() + '/keras_model.keras' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 412bb8d667..38b9de15f6 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -686,7 +686,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - model_path = model.config.get_output_dir() + '/keras_model.h5' + model_path = model.config.get_output_dir() + '/keras_model.keras' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) From bcfd6858d0ef3d1b575712f2acac6559274654fe Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 1 May 2024 20:21:44 +0200 Subject: [PATCH 055/103] Fix extension test for Keras v3 --- docs/advanced/extension.rst | 4 ++++ test/pytest/test_extensions.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/advanced/extension.rst b/docs/advanced/extension.rst index ad86051d82..b6acc4ac6a 100644 --- a/docs/advanced/extension.rst +++ b/docs/advanced/extension.rst @@ -35,6 +35,10 @@ For concreteness, let's say our custom layer ``KReverse`` is implemented in Kera def call(self, inputs): return tf.reverse(inputs, axis=[-1]) + def get_config(self): + return super().get_config() + +Make sure you define a ``get_config()`` method for your 
custom layer as this is needed for correct parsing. We can define the equivalent layer in hls4ml ``HReverse``, which inherits from ``hls4ml.model.layers.Layer``. .. code-block:: Python diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py index 0820a58c7c..bf5c7e2981 100644 --- a/test/pytest/test_extensions.py +++ b/test/pytest/test_extensions.py @@ -19,6 +19,10 @@ def __init__(self): def call(self, inputs): return tf.reverse(inputs, axis=[-1]) + def get_config(self): + # Breaks serialization and parsing in hls4ml if not defined + return super().get_config() + # hls4ml layer implementation class HReverse(hls4ml.model.layers.Layer): From 8c0959567e92633bdfc95f71bbee0d8941d8eb29 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 15 May 2024 22:15:28 +0200 Subject: [PATCH 056/103] Support ParallelizationFactor in SepConv1D/2D --- hls4ml/backends/vivado/vivado_backend.py | 53 ++++++++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 47974e10c3..0b24393134 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -294,9 +294,20 @@ def init_sepconv1d(self, layer): else: layer.set_attr('strategy', 'latency') - layer.set_attr( - 'n_partitions', 1 - ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + out_width = layer.get_output_variable().shape[0] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(1, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_width // closest_pf) + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) # Set the output type of the depthwise phase @@ -349,9 +360,21 @@ def init_sepconv2d(self, layer): else: layer.set_attr('strategy', 'latency') - layer.set_attr( - 'n_partitions', 1 - ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + out_height = layer.get_output_variable().shape[0] + out_width = layer.get_output_variable().shape[1] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) # Set the output type of the depthwise phase @@ -372,9 +395,21 @@ def init_depconv2d(self, layer): else: layer.set_attr('strategy', 'latency') - layer.set_attr( - 'n_partitions', 1 - ) # TODO Once we have SeparableConv implementation for io_parallel this should be set properly + out_height = layer.get_output_variable().shape[0] + out_width = layer.get_output_variable().shape[1] + chosen_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', 1) + valid_pf = self.get_valid_conv_partition_splits(out_height, out_width) + if chosen_pf not in valid_pf: + closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf) + valid_pf_str = ','.join(map(str, valid_pf)) + print( + f'WARNING: Invalid ParallelizationFactor={chosen_pf} in layer "{layer.name}".' + f'Using ParallelizationFactor={closest_pf} instead. Valid ParallelizationFactor(s): {valid_pf_str}.' 
+ ) + else: + closest_pf = chosen_pf + layer.set_attr('n_partitions', out_height * out_width // closest_pf) + layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) def _set_pooling_accum_t(self, layer, pool_size): From 11819acc24cc56868e7082928e1014fb020e3060 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 30 May 2024 18:51:16 -0500 Subject: [PATCH 057/103] updated pytest docker image --- test/pytest/ci-template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index afaf90da4d..f6aa700415 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -1,6 +1,6 @@ .pytest: stage: test - image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.5.base + image: gitlab-registry.cern.ch/fastmachinelearning/hls4ml-testing:0.5.6.base tags: - k8s-default before_script: From 39d923295daf792ba035fe2b761c938ff9cad935 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Mon, 3 Jun 2024 18:59:39 +0200 Subject: [PATCH 058/103] Don't test io_parallel for Catapult test and reduce the size of test to speed it up --- test/pytest/test_sepconv1d.py | 17 ++++++++++++----- test/pytest/test_sepconv2d.py | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index e64bd06a76..a0d6abae4d 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -12,7 +12,6 @@ keras_conv1d = [SeparableConv1D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1), (2)] kernel_options = [(2), (3)] bias_options = [False] @@ -24,14 +23,22 @@ @pytest.mark.parametrize('strides', strides_options) @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize('io_type', io_type_options) 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() - input_shape = (28, 3) + input_shape = (16, 3) model.add( conv1d( - filters=32, + filters=8, kernel_size=kernels, strides=strides, padding=padds, diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index da87488aa2..9c0ece575f 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -24,14 +24,22 @@ @pytest.mark.parametrize('strides', strides_options) @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize('io_type', io_type_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() - input_shape = (28, 28, 3) + input_shape = (16, 16, 3) model.add( conv2d( - filters=32, + filters=8, kernel_size=kernels, strides=strides, padding=padds, From 68a83d636f4b74b4f082022160692128d2c8e028 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 9 Jun 2024 14:03:33 +0200 Subject: [PATCH 059/103] Add explicit DepthwiseConv tests and simpligy SepConv tests --- test/pytest/test_depthconv1d.py | 66 ++++++++++++++++++++++++++++++++ test/pytest/test_depthconv2d.py | 67 +++++++++++++++++++++++++++++++++ test/pytest/test_sepconv1d.py | 11 ++---- test/pytest/test_sepconv2d.py | 11 ++---- 4 files changed, 141 insertions(+), 14 deletions(-) create 
mode 100644 test/pytest/test_depthconv1d.py create mode 100644 test/pytest/test_depthconv2d.py diff --git a/test/pytest/test_depthconv1d.py b/test/pytest/test_depthconv1d.py new file mode 100644 index 0000000000..3734815af0 --- /dev/null +++ b/test/pytest/test_depthconv1d.py @@ -0,0 +1,66 @@ +from pathlib import Path + +import numpy as np +import pytest +import tensorflow as tf + +import hls4ml + +test_root_path = Path(__file__).parent + +padds_options = ['same', 'valid'] +chans_options = ['channels_last'] +strides_options = [(1), (2)] +kernel_options = [(2), (3)] +bias_options = [False] + + +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('strides', strides_options) +@pytest.mark.parametrize('kernels', kernel_options) +@pytest.mark.parametrize('bias', bias_options) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) +def test_depthconv1d(chans, padds, strides, kernels, bias, io_type, backend): + model = tf.keras.models.Sequential() + input_shape = (16, 3) + model.add( + tf.keras.layers.DepthwiseConv1D( + kernel_size=kernels, + strides=strides, + padding=padds, + input_shape=input_shape, + kernel_initializer='normal', + use_bias=bias, + data_format=chans, + ) + ) + + model.compile(optimizer='adam', loss='mse') + X_input = np.random.rand(100, *input_shape) + keras_prediction = model.predict(X_input) + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') + kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') + output_dir = str( + test_root_path + / 'hls4mlprj_depthconv1d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type + ) + ) + hls_model = 
hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py new file mode 100644 index 0000000000..9178edf368 --- /dev/null +++ b/test/pytest/test_depthconv2d.py @@ -0,0 +1,67 @@ +from pathlib import Path + +import numpy as np +import pytest +import tensorflow as tf + +import hls4ml + +test_root_path = Path(__file__).parent + +padds_options = ['same', 'valid'] +chans_options = ['channels_last'] +io_type_options = ['io_parallel', 'io_stream'] +strides_options = [(1, 1), (2, 2)] +kernel_options = [(2, 2), (3, 3)] +bias_options = [False] + + +@pytest.mark.parametrize('chans', chans_options) +@pytest.mark.parametrize('padds', padds_options) +@pytest.mark.parametrize('strides', strides_options) +@pytest.mark.parametrize('kernels', kernel_options) +@pytest.mark.parametrize('bias', bias_options) +@pytest.mark.parametrize( + 'backend, io_type', + [ + ('Vivado', 'io_parallel'), + ('Vitis', 'io_parallel'), + ('Vivado', 'io_stream'), + ('Vitis', 'io_stream'), + ('Catapult', 'io_stream'), + ], +) +def test_depthconv2d(chans, padds, strides, kernels, bias, io_type, backend): + model = tf.keras.models.Sequential() + input_shape = (16, 16, 3) + model.add( + tf.keras.layers.DepthwiseConv2D( + kernel_size=kernels, + strides=strides, + padding=padds, + input_shape=input_shape, + kernel_initializer='normal', + use_bias=bias, + data_format=chans, + ) + ) + + model.compile(optimizer='adam', loss='mse') + X_input = np.random.rand(100, *input_shape) + keras_prediction = model.predict(X_input) + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', 
'') + kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') + output_dir = str( + test_root_path + / 'hls4mlprj_depthconv2d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type + ) + ) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.001) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index a0d6abae4d..64312e9932 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -3,13 +3,11 @@ import numpy as np import pytest import tensorflow as tf -from tensorflow.keras.layers import SeparableConv1D import hls4ml test_root_path = Path(__file__).parent -keras_conv1d = [SeparableConv1D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] strides_options = [(1), (2)] @@ -17,7 +15,6 @@ bias_options = [False] -@pytest.mark.parametrize('conv1d', keras_conv1d) @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides_options) @@ -33,11 +30,11 @@ ('Catapult', 'io_stream'), ], ) -def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): +def test_sepconv1d(chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 3) model.add( - conv1d( + tf.keras.layers.SeparableConv1D( filters=8, kernel_size=kernels, strides=strides, @@ -57,8 +54,8 @@ def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backen kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( - 
conv1d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type + / 'hls4mlprj_sepconv1d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type ) ) hls_model = hls4ml.converters.convert_from_keras_model( diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 9c0ece575f..58e63fec8a 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -3,13 +3,11 @@ import numpy as np import pytest import tensorflow as tf -from tensorflow.keras.layers import SeparableConv2D import hls4ml test_root_path = Path(__file__).parent -keras_conv2d = [SeparableConv2D] padds_options = ['same', 'valid'] chans_options = ['channels_last'] io_type_options = ['io_parallel', 'io_stream'] @@ -18,7 +16,6 @@ bias_options = [False] -@pytest.mark.parametrize('conv2d', keras_conv2d) @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides_options) @@ -34,11 +31,11 @@ ('Catapult', 'io_stream'), ], ) -def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): +def test_sepconv2d(chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 16, 3) model.add( - conv2d( + tf.keras.layers.SeparableConv2D( filters=8, kernel_size=kernels, strides=strides, @@ -58,8 +55,8 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backen kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( test_root_path - / 'hls4mlprj_{}_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( - conv2d.__name__.lower(), chans, stride_cfg, kernel_cfg, padds, backend, io_type + / 'hls4mlprj_sepconv2d_{}_strides_{}_kernels_{}_{}_padding_{}_{}'.format( + chans, stride_cfg, kernel_cfg, padds, backend, io_type ) ) hls_model = hls4ml.converters.convert_from_keras_model( From 
8a9d5568f42f80e631228d3647452715e1e97b6d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:06:27 +0000 Subject: [PATCH 060/103] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.15.2 → v3.16.0](https://github.com/asottile/pyupgrade/compare/v3.15.2...v3.16.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6db9312eb3..aa9e58da38 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: args: ["--profile", "black", --line-length=125] - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.16.0 hooks: - id: pyupgrade args: ["--py36-plus"] From ad86387dc2aebcb78a2097965c19bb479aa8da09 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 19 Apr 2024 10:31:04 +0200 Subject: [PATCH 061/103] Initial commit --- .../passes/fifo_depth_optimization.py | 69 +++++++++++++++++++ .../vitis_accelerator/supported_boards.json | 28 ++++++++ 2 files changed, 97 insertions(+) create mode 100644 hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..e983ca49fb --- /dev/null +++ b/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py @@ -0,0 +1,69 @@ +# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( +# generate_max_depth_file, +# get_vcd_data, +# populate_values, +# set_big_fifos, +# set_fifo_depth, +# ) +# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): +# def __init__(self): +# 
self.values = [] + +# def transform(self, model): +# # use `large_fifo_depth = 0` to keep the default fifo depth +# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) + +# # check axi-stream or io-stream, if not one the 2 exit +# if not ( +# model.config.get_config_value('IOType') == 'io_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' +# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' +# ): +# raise Exception( +# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' +# 'or `axi_stream` or `axi_master` in `AcceleratorConfig` interface field' +# ) + +# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be +# # profiled + +# if profiling_fifo_depth: +# set_big_fifos(model.output_vars, profiling_fifo_depth) + +# data = get_vcd_data(model) + +# for i in range(1, len(data['children'][0]['children'][0]['children'])): +# # wrapper fifos +# populate_values( +# self.values, +# data['children'][0]['children'][0]['children'][i]['name'], +# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], +# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], +# ) + +# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) +# for i in range(n_elem): +# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] +# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] +# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] +# populate_values(self.values, name, data_p, depth) + +# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] + +# generate_max_depth_file(model, maxs) + +# set_fifo_depth(model, maxs) + +# inp = model.get_input_variables()[0] +# out = model.get_output_variables()[0] +# for x in 
maxs: +# if 'in_local' in x['name']: +# inp.pragma = (inp.pragma[0], x['max'] + 1) +# elif 'out_local' in x['name']: +# out.pragma = (out.pragma[0], x['max'] + 1) + +# print('[hls4ml] - FIFO optimization completed') +# return False diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator/supported_boards.json index 4a54ea2924..1279ec22d0 100644 --- a/hls4ml/backends/vitis_accelerator/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator/supported_boards.json @@ -10,5 +10,33 @@ "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} + }, + "alveo-u50": { + "part": "xcu50-fsvh2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u250": { + "part": "xcu250-figd2104-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u200": { + "part": "xcu200-fsgd2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u280": { + "part": "xcu280-fsvh2892-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} } } From 4ea329be3c1f75d579db5e82f56f1284cee0b651 Mon Sep 17 00:00:00 2001 From: dgburnette <36940078+dgburnette@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:12:17 -0700 Subject: [PATCH 062/103] Stage initial set of changes for the Catapult backend (#956) * Stage initial set of changes for the Catapult backend * applied some changes for issues 
reported by pre-commit. But pre-commit still reorders backends/__init__.py incorrectly * final changes for clean pre-commit * minor edits * Checkin * Add file * pre-commit format * add in nnet_utils files * format changes for pre-commit * run flows by netlist type * update design pragmas on some blocks. cleaned up TCL script * move AC submodules under hls4ml/templates/catapult * merged in latest changes from mainline * remove bad submodules * recreate AC submodules in hls4ml/templates/catapult * pre-commit fixes * pre-commit fixes * turn on Catapult backend testing * removed io_parallel testing for Catapult backend * add Catapult * added Catapult * added Catapult * added Catapult to some pytests * Added concept of ProjectDir to distinguish the project directory of the HLS tool from the ProjectName which is used for the cpp file and top function name * better handling of c++ testbench data files. enhanced directory naming. * fix syntax * workaround from Giuseppe * Add concept of ProjectDir for Catapult which is different from ProjectName that gets used for the top function name and the cpp files * add new file from Giuseppe * improvements to project management, reporting and testbench * include new file in generation of parameters.h * add hard_tanh for io_parallel. formatting * Full path to the header nnet_helpers.h is necessary in the include (check if this is not an issue with recent versions of Catapult) * Avoid ceiling function from the math library: ceil(n/d) ---> (n+d-1)/n * These are mostly workarounds for the BUP synyhesis of a testing model (should these changes make in something more general?) 
* revert format back to what clang-format currently enforces * simplification from Giuesspe * Fixes for bottom-up handling of libraries * pre-commit format fixes * fix loops * consolidate prj scripts * cleanup pragmas * switch from using ssh to https for submodules * fix include path for non-catapult install * update pytest environment * CL 1100381 * CL 1098112 * roll in latest changes. turn off Catapult variants of test_binary_cnn and test_cnn_mnist_qkeras for now * fix test failure * disable Catapult test for pytorch until it is supported * disable Catapult for pytorch tests * Simpler submodule initialization for CI --------- Co-authored-by: David Burnette Co-authored-by: Giuseppe Di Guglielmo Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir Loncar --- hls4ml/backends/catapult/catapult_backend.py | 3 +- .../catapult/passes/transform_types.py | 6 +- hls4ml/backends/fpga/fpga_types.py | 177 ++++++++++++++++++ hls4ml/writer/catapult_writer.py | 2 +- test/pytest/ci-template.yml | 1 - test/pytest/test_cnn_mnist.py | 2 +- test/pytest/test_sepconv1d.py | 14 +- test/pytest/test_sepconv2d.py | 25 +-- 8 files changed, 196 insertions(+), 34 deletions(-) diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 0583e80dab..5556154dcb 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -4,8 +4,7 @@ import numpy as np from hls4ml.backends import FPGABackend -from hls4ml.backends.catapult.catapult_types import CatapultArrayVariableConverter -from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter +from hls4ml.backends.fpga.fpga_types import ACTypeConverter, CatapultArrayVariableConverter, HLSTypeConverter from hls4ml.model.attributes import ChoiceAttribute, ConfigurableAttribute, TypeAttribute from hls4ml.model.flow import register_flow from hls4ml.model.layers import ( diff --git a/hls4ml/backends/catapult/passes/transform_types.py 
b/hls4ml/backends/catapult/passes/transform_types.py index 3cbb917a67..4ef3548cb6 100755 --- a/hls4ml/backends/catapult/passes/transform_types.py +++ b/hls4ml/backends/catapult/passes/transform_types.py @@ -1,10 +1,12 @@ -from hls4ml.backends.catapult.catapult_types import ( +from hls4ml.backends.fpga.fpga_types import ( + ACTypeConverter, CatapultArrayVariableConverter, CatapultInplaceArrayVariableConverter, CatapultInplaceStreamVariableConverter, CatapultStreamVariableConverter, + HLSTypeConverter, + StaticWeightVariableConverter, ) -from hls4ml.backends.fpga.fpga_types import ACTypeConverter, HLSTypeConverter, StaticWeightVariableConverter from hls4ml.model.optimizer import GlobalOptimizerPass from hls4ml.model.types import InplaceTensorVariable diff --git a/hls4ml/backends/fpga/fpga_types.py b/hls4ml/backends/fpga/fpga_types.py index 15ad386c5a..408f1320e4 100644 --- a/hls4ml/backends/fpga/fpga_types.py +++ b/hls4ml/backends/fpga/fpga_types.py @@ -234,6 +234,42 @@ def definition_cpp(self, name_suffix='', as_reference=False): # region ArrayVariable +class VivadoArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class QuartusArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] {pragma}'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + +class CatapultArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}] /* {pragma} */'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp(), pragma=self.pragma + ) + + +class VivadoInplaceArrayVariableDefinition(VariableDefinition): + def 
definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class ArrayVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -251,11 +287,59 @@ def convert(self, tensor_var, pragma='partition'): return tensor_var +class VivadoArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoArrayVariableDefinition) + + +class QuartusArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusArrayVariableDefinition) + + +class CatapultArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultArrayVariableDefinition) + + +class VivadoInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceArrayVariableDefinition) + + +class QuartusInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceArrayVariableDefinition + ) + + +class CatapultInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceArrayVariableDefinition + ) + + # endregion # region 
StructMemberVariable +class QuartusStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class CatapultStructMemberVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.member_name, suffix=name_suffix, shape=self.size_cpp() + ) + + class StructMemberVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -278,11 +362,68 @@ def convert(self, tensor_var, pragma='partition', struct_name=None): return tensor_var +class QuartusStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStructMemberVariableDefinition + ) + + +class CatapultStructMemberVariableConverter(StructMemberVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStructMemberVariableDefinition + ) + + # endregion # region StreamVariable +class VivadoStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'hls::stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return 'hls::stream<{type}> {name}{suffix}("{name}")'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class VivadoInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class QuartusStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', 
as_reference=False): + if as_reference: # Function parameter + return f'stream<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return f'stream<{self.type.name}> {self.name}{name_suffix}' + + +class QuartusInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class CatapultStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'ac_channel<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration (string name arg not implemented in ac_channel) + return 'ac_channel<{type}> {name}{suffix}/*("{name}")*/'.format( + type=self.type.name, name=self.name, suffix=name_suffix + ) + + +class CatapultInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + class StreamVariableConverter: def __init__(self, type_converter, prefix, definition_cls): self.type_converter = type_converter @@ -304,6 +445,21 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var +class VivadoStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition) + + +class QuartusStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Quartus', definition_cls=QuartusStreamVariableDefinition) + + +class CatapultStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Catapult', definition_cls=CatapultStreamVariableDefinition) + + # endregion # region InplaceStreamVariable @@ -323,6 +479,27 @@ def convert(self, tensor_var, n_pack=1, depth=0): return tensor_var +class 
VivadoInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Vivado', definition_cls=VivadoInplaceStreamVariableDefinition + ) + + +class QuartusInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Quartus', definition_cls=QuartusInplaceStreamVariableDefinition + ) + + +class CatapultInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='Catapult', definition_cls=CatapultInplaceStreamVariableDefinition + ) + + # endregion # region WeightsVariable diff --git a/hls4ml/writer/catapult_writer.py b/hls4ml/writer/catapult_writer.py index af3f28a59e..48d44e4a59 100755 --- a/hls4ml/writer/catapult_writer.py +++ b/hls4ml/writer/catapult_writer.py @@ -884,7 +884,7 @@ def write_yml(self, model): """ def keras_model_representer(dumper, keras_model): - model_path = model.config.get_output_dir() + '/keras_model.keras' + model_path = model.config.get_output_dir() + '/keras_model.h5' keras_model.save(model_path) return dumper.represent_scalar('!keras_model', model_path) diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml index f6aa700415..a9391709cb 100644 --- a/test/pytest/ci-template.yml +++ b/test/pytest/ci-template.yml @@ -5,7 +5,6 @@ - k8s-default before_script: - source ~/.bashrc - - git config --global --add safe.directory /builds/fastmachinelearning/hls4ml - git submodule update --init --recursive hls4ml/templates/catapult/ - if [ $EXAMPLEMODEL == 1 ]; then git submodule update --init example-models; fi - conda activate hls4ml-testing diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py index ab3365f228..27b966f51d 100644 --- a/test/pytest/test_cnn_mnist.py +++ b/test/pytest/test_cnn_mnist.py @@ -61,7 +61,7 @@ 
def keras_model(mnist_data): ('Vitis', 'io_parallel', 'resource'), ('Vitis', 'io_parallel', 'latency'), ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), + ('Vitis', 'io_stream', 'latency'), ], ) def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy): diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64312e9932..1afdfbae67 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -20,17 +20,9 @@ @pytest.mark.parametrize('strides', strides_options) @pytest.mark.parametrize('kernels', kernel_options) @pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize( - 'backend, io_type', - [ - ('Vivado', 'io_parallel'), - ('Vitis', 'io_parallel'), - ('Vivado', 'io_stream'), - ('Vitis', 'io_stream'), - ('Catapult', 'io_stream'), - ], -) -def test_sepconv1d(chans, padds, strides, kernels, bias, io_type, backend): +@pytest.mark.parametrize('io_type', io_type_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +def test_sepconv1d(conv1d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 3) model.add( diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 58e63fec8a..bee2227a86 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -16,22 +16,15 @@ bias_options = [False] -@pytest.mark.parametrize('chans', chans_options) -@pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('strides', strides_options) -@pytest.mark.parametrize('kernels', kernel_options) -@pytest.mark.parametrize('bias', bias_options) -@pytest.mark.parametrize( - 'backend, io_type', - [ - ('Vivado', 'io_parallel'), - ('Vitis', 'io_parallel'), - ('Vivado', 'io_stream'), - ('Vitis', 'io_stream'), - ('Catapult', 'io_stream'), - ], -) -def test_sepconv2d(chans, padds, strides, kernels, bias, io_type, backend): +@pytest.mark.parametrize("conv2d", 
keras_conv2d) +@pytest.mark.parametrize("chans", chans_options) +@pytest.mark.parametrize("padds", padds_options) +@pytest.mark.parametrize("strides", strides_options) +@pytest.mark.parametrize("kernels", kernel_options) +@pytest.mark.parametrize("bias", bias_options) +@pytest.mark.parametrize("io_type", io_type_options) +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Catapult']) +def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backend): model = tf.keras.models.Sequential() input_shape = (16, 16, 3) model.add( From 992b9b766a79ec6f49a65b9655a4e7c46e949452 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:49:18 +0200 Subject: [PATCH 063/103] Rudimentary optimizer to infer 'auto' precision --- .../model/optimizer/passes/infer_precision.py | 211 ++++-------------- 1 file changed, 43 insertions(+), 168 deletions(-) diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 51422c534e..5ef1c2dee5 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -1,23 +1,13 @@ -import math +from copy import deepcopy import numpy as np -from hls4ml.model.optimizer import ConfigurableOptimizerPass +from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import FixedPrecisionType, UnspecifiedPrecisionType -# TODO: The code assumes everything is Fixed or Integer precision. 
Need to add checks - - -class InferPrecisionTypes(ConfigurableOptimizerPass): - def __init__(self): - # The option, infer_no_bias, allows you to tailor for the given weights, in particular, zero bias - self.infer_no_bias = False +class InferPrecisionTypes(OptimizerPass): def match(self, node): - input_var = node.get_input_variable() - if input_var is not None and isinstance(input_var.type, UnspecifiedPrecisionType): - # only infer types if the input type is known - return False for layer_type in node.types.values(): if isinstance(layer_type.precision, UnspecifiedPrecisionType): return True @@ -34,16 +24,14 @@ def transform(self, model, node): if type_name not in inferred_types: self._infer_default_type(node, type_name) - # if the return type was set, this may allow InferPrecisionTypes to be run - # on layers it was not previously able to - return 'result_t' in types_to_infer + return False # No model graph changes made def _infer_precision(self, node, types_to_infer): node_class = node.class_name if node_class in ['Dense']: return self._infer_dense_precision(node, types_to_infer) - if node_class in ['BatchNormalization', 'ApplyAlpha']: + if node_class in ['BatchNormalization']: return self._infer_bn_precision(node, types_to_infer) if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: @@ -58,24 +46,14 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Clone', 'Reshape', 'Resize', 'Transpose', 'ZeroPadding1D', 'ZeroPadding2D']: return self._infer_output_matching_precision(node, types_to_infer) - if node_class in ['Merge']: + if node_class in ['Concatenate', 'Merge']: return self._infer_merge_precision(node, types_to_infer) - if node_class in ['Concatenate']: - return self._infer_cat_precision(node, types_to_infer) - - if node_class in ['Dot']: - return self._infer_dot_precision(node, types_to_infer) - # What about quantized activation layer? Setting it to 'auto' manually will break it here. 
We should prevent # this in config_from_* functions return [] - def _get_default_precision(self, node): - model_config = node.model.config - return model_config.backend.convert_precision_string(model_config.model_precision['default']) - def _infer_default_type(self, node, type_name): model_config = node.model.config default_precision = model_config.backend.convert_precision_string(model_config.model_precision['default']) @@ -98,51 +76,47 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): input_precision = node.get_input_variable().type.precision input_width = input_precision.width input_integers = input_precision.integer - input_signed = input_precision.signed if 'weight_t' in types_to_infer: weight_quantizer = node.get_attr('weight_quantizer', None) if weight_quantizer is not None: + weight_width = weight_quantizer.bits + weight_integers = weight_quantizer.hls_type.integer node.types['weight_t'].name = node.name + '_weight_t' node.types['weight_t'].precision = weight_quantizer.hls_type else: self._infer_default_type(node, 'weight_t') + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer node.weights['weight'].update_precision(node.types['weight_t'].precision) - inferred_types.append('weight_t') - weight_width = node.types['weight_t'].precision.width - weight_integers = node.types['weight_t'].precision.integer - weight_signed = node.types['weight_t'].precision.signed + inferred_types.append('weight_t') + else: + weight_width = node.types['weight_t'].precision.width + weight_integers = node.types['weight_t'].precision.integer if 'bias_t' in types_to_infer: bias_quantizer = node.get_attr('bias_quantizer', None) if bias_quantizer is not None: + bias_width = bias_quantizer.bits + bias_integers = bias_quantizer.hls_type.integer node.types['bias_t'].name = node.name + '_bias_t' node.types['bias_t'].precision = bias_quantizer.hls_type else: self._infer_default_type(node, 'bias_t') + bias_width 
= node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer node.weights['bias'].update_precision(node.types['bias_t'].precision) - inferred_types.append('bias_t') - - bias_width = node.types['bias_t'].precision.width - bias_integers = node.types['bias_t'].precision.integer - bias_signed = node.types['bias_t'].precision.signed - no_bias = node.weights['bias'].nonzeros == 0 and self.infer_no_bias # no bias - # using math.ceil instead of np.ceil because it returns an int - bitwidth = weight_width + input_width + math.ceil(np.log2(n_ops)) - integers = weight_integers + input_integers + math.ceil(np.log2(n_ops)) - signed = weight_signed or input_signed - - frac = bitwidth - integers - - if not no_bias: - integers = max(integers + (bias_signed and not signed), bias_integers + (signed and not bias_signed)) + 1 - bitwidth = integers + max(frac, bias_width - bias_integers) - signed = signed or bias_signed + inferred_types.append('bias_t') + else: + bias_width = node.types['bias_t'].precision.width + bias_integers = node.types['bias_t'].precision.integer - # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. 
- new_type = FixedPrecisionType(bitwidth, integers, signed) + new_type = FixedPrecisionType( + width=int(max(np.ceil(input_width + weight_width + np.log2(n_ops)), bias_width) + 1), + integer=int(max(np.ceil(input_integers + weight_integers + np.log2(n_ops)), bias_integers) + 1), + ) if 'accum_t' in types_to_infer: node.types['accum_t'].name = node.name + '_accum_t' @@ -159,7 +133,7 @@ def _infer_common_precision(self, node, types_to_infer, n_ops): return inferred_types def _infer_dense_precision(self, node, types_to_infer): - n_ops = node.get_attr('n_in') + n_ops = node.get_attr('n_in') * node.get_attr('n_out') return self._infer_common_precision(node, types_to_infer, n_ops) def _infer_conv_precision(self, node, types_to_infer): @@ -243,11 +217,6 @@ def _infer_sepconv_precision(self, node, types_to_infer): return inferred_types def _infer_bn_precision(self, node, types_to_infer): - """ - The batchnormalziation precision here is the more implementation-focused version. It propagates - precision from scale and bias, not mean, variance, etc. 
- """ - inferred_types = [] if 'scale_t' in types_to_infer: @@ -261,28 +230,16 @@ def _infer_bn_precision(self, node, types_to_infer): inferred_types.append('bias_t') if 'result_t' in types_to_infer: - input_precision = node.get_input_variable().type.precision scale_precision = node.types['scale_t'].precision bias_precision = node.types['bias_t'].precision - after_scale_signed = scale_precision.signed or input_precision.signed - after_scale_width = input_precision.width + scale_precision.width - after_scale_integer = input_precision.integer + scale_precision.integer - - out_precision_signed = after_scale_signed or bias_precision.signed - out_precision_integer = ( - max( - after_scale_integer + (bias_precision.signed and not after_scale_signed), - bias_precision.integer + (after_scale_signed and not bias_precision.signed), - ) - + 1 - ) - out_precision_width = out_precision_integer + max( - after_scale_width - after_scale_integer, bias_precision.fractional - ) + out_precision = deepcopy(node.get_input_node().get_output_variable().type.precision) + out_precision.integer += scale_precision.integer + out_precision.fractional = max(out_precision.fractional, scale_precision.fractional) - # Note: this is guaranteed to not overflow or need rounding, so it's sufficient to use the simpler form. 
- out_precision = FixedPrecisionType(out_precision_width, out_precision_integer, out_precision_signed) + out_precision.integer = max(out_precision.integer, bias_precision.integer) + 1 + out_precision.fractional = max(out_precision.fractional, bias_precision.fractional) + out_precision.width = out_precision.fractional + out_precision.integer node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision @@ -296,21 +253,15 @@ def _infer_pooling_precision(self, node, types_to_infer): if 'accum_t' in types_to_infer: input_precision = node.get_input_variable().type.precision - pool_op = node.attributes['pool_op'].lower() - - width = input_precision.width - integer = input_precision.integer - signed = input_precision.signed + input_width = input_precision.width + input_integers = input_precision.integer - pool_size = node.get_attr('pool_height', 1) * node.get_attr('pool_width') - if pool_op == 'average': - extra_bits = int(np.ceil(np.log2(pool_size))) - elif pool_op == 'max': - extra_bits = 0 - else: - raise ValueError(f'Unknown pooling operation: {pool_op}') + n_ops = node.get_attr('n_filt') * node.get_attr('pool_height', 1) * node.get_attr('pool_width') - accum_type = FixedPrecisionType(width=width + extra_bits * 2, integer=integer + extra_bits, signed=signed) + accum_type = FixedPrecisionType( + width=int(np.ceil(input_width + np.log2(n_ops)) + 1), + integer=int(np.ceil(input_integers + np.log2(n_ops)) + 1), + ) node.types['accum_t'].name = node.name + '_accum_t' node.types['accum_t'].precision = accum_type @@ -329,86 +280,10 @@ def _infer_merge_precision(self, node, types_to_infer): input_1 = node.get_input_variable(node.inputs[0]).type.precision input_2 = node.get_input_variable(node.inputs[1]).type.precision - op = node.get_attr('op').lower() - if op in ('add', 'subtract', 'average'): - new_signed = input_1.signed or input_2.signed or op == 'subtract' - new_int = ( - max( - input_1.integer + (input_2.signed and not 
input_1.signed), - input_2.integer + (input_1.signed and not input_2.signed), - ) - + 1 - ) - new_width = new_int + max(input_1.fractional, input_2.fractional) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - elif op == 'multiply': - new_signed = input_1.signed or input_2.signed - new_int = input_1.integer + input_2.integer - new_width = input_1.width + input_2.width - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - elif op in ('maximum', 'minimum'): - new_signed = input_1.signed or input_2.signed - - input_1_integer = input_1.integer - input_2_integer = input_2.integer - - # add one to integer if unsigned while new is signed - if new_signed and not input_1.signed: - input_1_integer += 1 - if new_signed and not input_2.signed: - input_2_integer += 1 - - new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) - new_int = max(input_1_integer, input_2_integer) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) - else: - print(f'Warning: not propagating weights for type {op}') - out_precision = self._get_default_precision(node) - - node.types['result_t'].name = node.name + '_result_t' - node.types['result_t'].precision = out_precision - - return ['result_t'] - - def _infer_cat_precision(self, node, types_to_infer): - assert 'result_t' in types_to_infer and len(types_to_infer) == 1 - - input_1 = node.get_input_variable(node.inputs[0]).type.precision - input_2 = node.get_input_variable(node.inputs[1]).type.precision - - new_signed = input_1.signed or input_2.signed - - input_1_integer = input_1.integer - input_2_integer = input_2.integer - - # add one to integer if unsigned while new is signed - if new_signed and not input_1.signed: - input_1_integer += 1 - if new_signed and not input_2.signed: - input_2_integer += 1 - - new_width = max(input_1.fractional, input_2.fractional) + max(input_1_integer, input_2_integer) - new_int = max(input_1_integer, input_2_integer) - - 
out_precision = FixedPrecisionType(new_width, new_int, new_signed) - node.types['result_t'].name = node.name + '_result_t' - node.types['result_t'].precision = out_precision - - return ['result_t'] - - def _infer_dot_precision(self, node, types_to_infer): - assert 'result_t' in types_to_infer and len(types_to_infer) == 1 - - input_1 = node.get_input_variable(node.inputs[0]).type.precision - input_2 = node.get_input_variable(node.inputs[1]).type.precision - - n_in = node.get_input_variable(node.inputs[0]).shape[0] - - new_signed = input_1.signed or input_2.signed - new_width = input_1.width + input_2.width + math.ceil(np.log2(n_in)) - new_int = input_1.integer + input_2.integer + math.ceil(np.log2(n_in)) + new_width = max(input_1.fractional, input_2.fractional) + max(input_1.integer, input_2.integer) + new_int = max(input_1.integer, input_2.integer) - out_precision = FixedPrecisionType(new_width, new_int, new_signed) + out_precision = FixedPrecisionType(new_width, new_int) node.types['result_t'].name = node.name + '_result_t' node.types['result_t'].precision = out_precision From 8174465998afd6689bcb3b59d70472c7b52278bd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Sun, 20 Aug 2023 22:50:18 +0200 Subject: [PATCH 064/103] Sepconv fixes --- hls4ml/backends/vivado/passes/convolution_templates.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 037f2d5eb2..4b46546971 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -286,10 +286,7 @@ def format(self, node): params['index'] = str(node.index) + '_depthwise' params['weight_t'] = node.get_weights('depthwise').type params['bias_t'] = node.get_weights('zero_bias').type - if node.model.config.get_config_value('IOType') == 'io_parallel': - params['fill_fn'] = f'fill_buffer_{node.index}_dw' - else: - 
params['fill_fn'] = 'FillConv1DBuffer' + params['fill_fn'] = 'FillConv1DBuffer' if node.get_attr('unscaled'): params['scale_index_type'] = 'scale_index_unscaled' From 84ff2c6b21ef0f74c9585111a0518ff0049636cd Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 21 Feb 2024 21:39:00 +0100 Subject: [PATCH 065/103] Optimizer to remove expensive Transpose that serves as Flatten --- hls4ml/model/optimizer/__init__.py | 2 -- .../passes/convert_to_channels_last.py | 29 +++++-------------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..247e799ec6 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,8 +36,6 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', - 'remove_nop_transpose', - 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index 01e949086e..c283e28c92 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -144,38 +144,23 @@ def match(self, node): if node.model.config.get_config_value('IOType') != 'io_parallel': return False - if hasattr(node, '_channels_last_keep_transpose') and node._channels_last_keep_transpose: - return False - if isinstance(node, Reshape): input_node = node.get_input_node() output_nodes = node.get_output_nodes() - if ( - len(node.get_attr('target_shape')) == 1 - and isinstance(input_node, Transpose) - and len(output_nodes) == 1 - and isinstance(output_nodes[0], Dense) - ): + if len(node.get_attr('target_shape')) == 1 and isinstance(input_node, Transpose) \ + and len(output_nodes) == 1 and isinstance(output_nodes[0], Dense): return True - + return False - + def transform(self, model, 
node): transpose_node = node.get_input_node() dense_node = node.get_output_nodes()[0] input_shape = transpose_node.get_output_variable().shape - if len(input_shape) == 2: # Usually after Conv1D - tran_axis = [1, 0, 2] - elif len(input_shape) == 3: # Usually after Conv2D - tran_axis = [1, 2, 0, 3] - else: # In this case we bail - node._channels_last_keep_transpose = True - return False - weight_var = dense_node.get_weights('weight') # Transpose the weights to achieve the same computation with transposed input - weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(*tran_axis) + weight_data_t = weight_var.data.reshape(*input_shape, -1).transpose(1, 2, 0, 3) weight_data_t = weight_data_t.reshape(-1, weight_data_t.shape[-1]) new_weight_var = WeightVariable( var_name=weight_var.name, @@ -183,9 +168,9 @@ def transform(self, model, node): precision=weight_var.type.precision, quantizer=weight_var.quantizer, data=weight_data_t, - index=dense_node.index, + index=dense_node.index ) - + # Update the weight variable of the node dense_node.set_attr('weight', new_weight_var) From 518796d12f1209c4798e1ee934a4a66e0a130971 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:35 +0100 Subject: [PATCH 066/103] Remove transpose of input if n_chan=1 --- hls4ml/model/optimizer/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 247e799ec6..3aa247d03f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,6 +36,8 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', + 'remove_nop_transpose', + 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', From 238e35cf7595bf9e6b4d2a4a87d2f4d187c9add2 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Wed, 21 Feb 2024 21:39:00 +0100 Subject: [PATCH 067/103] Optimizer to remove expensive 
Transpose that serves as Flatten --- hls4ml/model/optimizer/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..247e799ec6 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,8 +36,6 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', - 'remove_nop_transpose', - 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', From c10dd8212eaeed54fbb726614373ccb84db4c11b Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Tue, 27 Feb 2024 21:09:35 +0100 Subject: [PATCH 068/103] Remove transpose of input if n_chan=1 --- hls4ml/model/optimizer/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 247e799ec6..3aa247d03f 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -36,6 +36,8 @@ 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', + 'remove_nop_transpose', + 'remove_single_channel_transpose', 'fuse_bias_add', 'expand_layer_group', 'output_rounding_saturation_mode', From d6fe369a31154a7d7fbebff72d873257a6a569fd Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 13 Jun 2024 15:21:12 -0500 Subject: [PATCH 069/103] fix up automatic precision inferrence --- .../model/optimizer/passes/infer_precision.py | 9 +- .../optimizer/passes/seperable_to_dw_conv.py | 127 ++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 hls4ml/model/optimizer/passes/seperable_to_dw_conv.py diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index 5ef1c2dee5..0b323abc35 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -37,7 +37,10 @@ def 
_infer_precision(self, node, types_to_infer): if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: return self._infer_conv_precision(node, types_to_infer) - if node_class in ['SeparableConv1D', 'SeparableConv2D', 'DepthwiseConv2D']: + if node_class in ['DepthwiseConv1D', 'DepthwiseConv2D']: + return self._infer_depthconv_precision(node, types_to_infer) + + if node_class in ['SeparableConv1D', 'SeparableConv2D']: return self._infer_sepconv_precision(node, types_to_infer) if node_class in ['Pooling1D', 'Pooling2D']: @@ -140,6 +143,10 @@ def _infer_conv_precision(self, node, types_to_infer): n_ops = node.get_attr('n_chan') * node.get_attr('filt_height', 1) * node.get_attr('filt_width') return self._infer_common_precision(node, types_to_infer, n_ops) + def _infer_depthconv_precision(self, node, types_to_infer): + n_ops = node.get_attr('filt_height', 1) * node.get_attr('filt_width') + return self._infer_common_precision(node, types_to_infer, n_ops) + def _infer_sepconv_precision(self, node, types_to_infer): inferred_types = [] diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py new file mode 100644 index 0000000000..0142f686d0 --- /dev/null +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -0,0 +1,127 @@ +""" +This optimizer converts a seperable convolution to a depthwise followed by a regular convolution. +For backends with a custom pointwise implementations the regular convolution will subsequently +be converted to a pointwise convolution by a different optimizer. 
+""" + +import copy + +from hls4ml.model.layers import SeparableConv1D, SeparableConv2D +from hls4ml.model.optimizer import OptimizerPass + + +class SeperableToDepthwiseAndConv(OptimizerPass): + """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)""" + + _dw_attributes = ( + 'in_width', + 'out_width', + 'n_chan', + 'depth_multiplier', + 'pad_left', + 'pad_right', + 'filt_width', + 'stride_width', + 'dilation_width', + 'in_height', + 'out_height', + 'pad_top', + 'pad_bottom', + 'filt_height', + 'stride_height', + 'dilation_height', + 'data_format', + 'depthwise_data', + 'depthwise_quantizer', + 'padding', + ) + + _pw_attributes = ('out_width', 'n_filt', 'dilation_width', 'out_height', 'dilation_height', 'data_format', 'use_bias') + + def match(self, node): + return isinstance(node, (SeparableConv1D, SeparableConv2D)) + + def transform(self, model, node): + dim = node.__class__.__name__[-2:] # '1D' or '2D' + + # get the layer configuration name + layer_config = model.config.get_layer_config(node) + + # First do depthwise + dw_name = f'{node.name}_depthwise' + + # now the layer config (so that set configuration get copied) + dw_layer_config = copy.deepcopy(layer_config) + + if dw_layer_config: + dw_precision_cfg = dw_layer_config.setdefault('Precision', {}) + if isinstance(dw_precision_cfg, dict): + if 'depthwise' in dw_precision_cfg: + dw_precision_cfg['weight'] = dw_precision_cfg['depthwise'] + del dw_precision_cfg['depthwise'] + if 'depthwise_accum' in dw_precision_cfg: + dw_precision_cfg['accum'] = dw_precision_cfg['depthwise_accum'] + del dw_precision_cfg['depthwise_accum'] + if 'depthwise_result' in dw_precision_cfg: + dw_precision_cfg['result'] = dw_precision_cfg['depthwise_result'] + del dw_precision_cfg['depthwise_result'] + dw_precision_cfg.pop('pointwise', None) + dw_precision_cfg.pop('pointwise_accum', None) + model.config.set_name_config(dw_name, dw_layer_config) + model.config.parse_name_config(dw_name, dw_layer_config) + + # 
creating the attributes + dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} + + dw_attributes['use_bias'] = False + + new_dw = model.make_node('DepthwiseConv' + dim, dw_name, dw_attributes, [node.inputs[0]]) + + # Then do convolution + pw_name = f'{node.name}_pointwise' + + # now the layer config (so that set configuration get copied) + pw_layer_config = copy.deepcopy(layer_config) + + if pw_layer_config: + pw_precision_cfg = pw_layer_config.setdefault('Precision', {}) + if isinstance(pw_precision_cfg, dict): + if 'pointwise' in pw_precision_cfg: + pw_precision_cfg['weight'] = pw_precision_cfg['pointwise'] + del pw_precision_cfg['pointwise'] + if 'pointwise_accum' in pw_precision_cfg: + pw_precision_cfg['accum'] = pw_precision_cfg['pointwise_accum'] + del pw_precision_cfg['pointwise_accum'] + if 'pointwise_result' in pw_precision_cfg: + pw_precision_cfg['result'] = pw_precision_cfg['pointwise_result'] + del pw_precision_cfg['pointwise_result'] + pw_precision_cfg.pop('depthwise', None) + pw_precision_cfg.pop('depthwise_accum', None) + model.config.set_name_config(pw_name, pw_layer_config) + model.config.parse_name_config(pw_name, pw_layer_config) + + # creating the attributes + pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes} + pw_attributes['filt_width'] = 1 + pw_attributes['filt_height'] = 1 + pw_attributes['stride_width'] = 1 + pw_attributes['stride_height'] = 1 + pw_attributes['pad_left'] = 0 + pw_attributes['pad_right'] = 0 + pw_attributes['pad_top'] = 0 + pw_attributes['pad_bottom'] = 0 + pw_attributes['in_width'] = pw_attributes['out_width'] + pw_attributes['in_height'] = pw_attributes.get('out_height', 1) + pw_attributes['n_chan'] = node.get_attr('n_chan') * node.get_attr('depth_multiplier') + pw_attributes['weight_data'] = node.get_attr('pointwise_data') + pw_attributes['weight_quantizer'] = 
node.get_attr('pointwise_quantizer') + pw_attributes['bias_data'] = node.get_attr('bias_data') + pw_attributes['bias_quantizer'] = node.get_attr('bias_quantizer') + + # note this is just regular convolution. It is replaced by a special pointwise implementation + # if available by another optimizer + new_pw = model.make_node('Conv' + dim, pw_name, pw_attributes, [dw_name]) + + model.split_node(node, new_dw, new_pw) + + return True From 7290a29167c2e044912a3c6c8ea326f1621a41ea Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 10 Jun 2024 19:13:15 -0500 Subject: [PATCH 070/103] starting towards being able to split seperable --- hls4ml/backends/fpga/fpga_backend.py | 10 ++++ hls4ml/backends/vivado/vivado_backend.py | 6 --- hls4ml/model/graph.py | 62 ++++++++++++++---------- hls4ml/model/layers.py | 16 ++++++ 4 files changed, 62 insertions(+), 32 deletions(-) diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 87309ff4e5..672627e35f 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -79,6 +79,16 @@ def __init__(self, name): attrs.append(ConfigurableAttribute('reuse_factor', default=1)) self.attribute_map[layer] = attrs + # seperable is kind of special because it is effectively two layers that will be split + for layer in (SeparableConv1D, SeparableConv2D): + attrs = self.attribute_map.get(layer, []) + attrs.append(TypeAttribute('depthwise_accum')) + attrs.append(TypeAttribute('pointwise_accum')) + attrs.append(TypeAttribute('depthwise_result')) + attrs.append(ConfigurableAttribute('depthwise_reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('pointwise_reuse_factor', default=1)) + self.attribute_map[layer] = attrs + act_attrs = self.attribute_map.get(Activation, []) act_attrs.append(ConfigurableAttribute('table_size', default=1024)) act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) diff --git a/hls4ml/backends/vivado/vivado_backend.py 
b/hls4ml/backends/vivado/vivado_backend.py index 0b24393134..b80c6664af 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -76,12 +76,6 @@ def _register_layer_attributes(self): attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) self.attribute_map[layer] = attrs - sep_conv_layers = [SeparableConv1D, SeparableConv2D] - for layer in sep_conv_layers: - attrs = self.attribute_map.get(layer, []) - attrs.append(TypeAttribute('dw_output', default=FixedPrecisionType(18, 8))) - self.attribute_map[layer] = attrs - def _register_flows(self): initializers = self._get_layer_initializers() init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 04ec33294d..d1722eaae1 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -100,6 +100,12 @@ def get_layer_config(self, layer): return layer_config + def set_name_config(self, name, config): + """sets hls_config["LayerName"][name] = config""" + hls_config = self.config['HLSConfig'] + layer_config = hls_config.setdefault('LayerName', {}) + layer_config[name] = config + def get_precision(self, layer, var='default'): precision = self.layer_name_precision.get(layer.name.lower() + '_' + var) type_name = layer.name.lower() + '_' + var + '_t' @@ -183,6 +189,35 @@ def get_compression(self, layer): return compression + def parse_name_config(self, layer_name, layer_cfg): + """This is used by _parse_hls_config below, but also in optimizers when a new layer config is created""" + precision_cfg = layer_cfg.get('Precision') + if isinstance(precision_cfg, dict): + for var, precision in precision_cfg.items(): + self.layer_name_precision[layer_name.lower() + '_' + var] = precision + else: + self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg + + rf = layer_cfg.get('ReuseFactor') + if rf is not None: + 
self.layer_name_rf[layer_name.lower()] = rf + + targ_cycles = layer_cfg.get('TargetCycles') + if targ_cycles is not None: + self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles + + strategy = layer_cfg.get('Strategy') + if strategy is not None: + self.layer_name_strategy[layer_name.lower()] = strategy + + conv_implementation = layer_cfg.get('ConvImplementation') + if conv_implementation is not None: + self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation + + compression = layer_cfg.get('Compression') + if compression is not None: + self.layer_name_compression[layer_name.lower()] = bool(compression) + def _parse_hls_config(self): hls_config = self.config['HLSConfig'] @@ -255,32 +290,7 @@ def _parse_hls_config(self): layer_name_cfg = hls_config.get('LayerName') if layer_name_cfg is not None: for layer_name, layer_cfg in layer_name_cfg.items(): - precision_cfg = layer_cfg.get('Precision') - if isinstance(precision_cfg, dict): - for var, precision in precision_cfg.items(): - self.layer_name_precision[layer_name.lower() + '_' + var] = precision - else: - self.layer_name_precision[layer_name.lower() + '_default'] = precision_cfg - - rf = layer_cfg.get('ReuseFactor') - if rf is not None: - self.layer_name_rf[layer_name.lower()] = rf - - targ_cycles = layer_cfg.get('TargetCycles') - if targ_cycles is not None: - self.layer_name_targ_cycles[layer_name.lower()] = targ_cycles - - strategy = layer_cfg.get('Strategy') - if strategy is not None: - self.layer_name_strategy[layer_name.lower()] = strategy - - conv_implementation = layer_cfg.get('ConvImplementation') - if conv_implementation is not None: - self.layer_name_conv_implementation[layer_name.lower()] = conv_implementation - - compression = layer_cfg.get('Compression') - if compression is not None: - self.layer_name_compression[layer_name.lower()] = bool(compression) + self.parse_name_config(layer_name, layer_cfg) def _validate_hls_config(self): use_dataflow = False diff --git 
a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 0d9cc0622c..f076a1e5f0 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -100,6 +100,7 @@ def __init__(self, model, name, attributes, inputs, outputs=None): layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): + print(f'{config_key=}, {config_value=}') config_key = convert_to_snake_case(config_key) if config_key in self.attributes: print( @@ -179,6 +180,12 @@ def _set_accum_t(self): accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) self.set_attr('accum_t', accum_t) + def _set_type_t(self, name): + has_type_t = any(a for a in self.expected_attributes if a.name == name + '_t' and isinstance(a, TypeAttribute)) + if has_type_t: + type_t = NamedType(*reversed(self.model.config.get_precision(self, name))) + self.set_attr(name + '_t', type_t) + def get_input_node(self, input_name=None): if input_name is None: if len(self.inputs) > 0: @@ -470,6 +477,11 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) + # set the needed types if needed + self._set_type_t('pointwise_accum') + self._set_type_t('depthwise_accum') + self._set_type_t('depthwise_result') + class DepthwiseConv1D(Conv1D): def initialize(self): @@ -616,6 +628,10 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self._set_type_t('pointwise_accum') + self._set_type_t('depthwise_accum') + self._set_type_t('depthwise_result') + class DepthwiseConv2D(Conv2D): def initialize(self): From 13fcf0a0c16ea380fad65bf59daaa533029cf68e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 11 Jun 2024 19:27:07 -0500 Subject: [PATCH 071/103] complete implementation of seperable -> dw + pw, untested --- .../vivado/passes/convolution_templates.py | 2 +- hls4ml/converters/keras/convolution.py | 3 + hls4ml/model/graph.py | 38 +++++++++++++ hls4ml/model/layers.py | 56 +++++++++++++++++-- 
hls4ml/model/optimizer/__init__.py | 1 + 5 files changed, 95 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 4b46546971..36ec0aa475 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -280,7 +280,7 @@ def format(self, node): # Override bias and bias_t since these are zeros in depthwise step of SepConv1D params['bias'] = params['zero_bias'] params['bias_t'] = params['zero_bias_t'] - params['n_filt'] = params['n_chan'] # In depthwise step n_chan == n_filt + params['n_filt'] = params['n_chan'] * node.get_attr('depth_multiplier') # In depthwise step n_chan == n_filt params['dilation'] = node.get_attr('dilation', 1) params['nzeros'] = node.get_weights('depthwise').nzeros params['index'] = str(node.index) + '_depthwise' diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 39780f6dc6..0eaa967844 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -60,6 +60,9 @@ def parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_data'] = get_weights_data(data_reader, layer['name'], 'bias') + if 'depth_multiplier' in keras_layer['config']: + layer['depth_multiplier'] = keras_layer['config']['depth_multiplier'] + if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index d1722eaae1..10b3a0f854 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -615,6 +615,44 @@ def replace_node(self, old_node, new_node): self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) self._update_model_outputs() + def split_node(self, old_node, new_node1, new_node2): + """Replace an existing node in the graph with two 
nodes in sequence. + + Args: + old_node (Layer): The node to replace + new_node1 (Layer): The first new node in sequence + new_node2 (Layer): The second new node in sequence + + """ + + # fmt: off + assert len(new_node1.inputs) == len(old_node.inputs), \ + f'{new_node1.name} and {old_node.name} have different number of inputs' + assert len(new_node2.outputs) == len(old_node.outputs), \ + f'{new_node2.name} and {old_node.name} have different number of outputs' + # fmt: on + + repl = {old_name: new_name for old_name, new_name in zip(old_node.outputs, new_node2.outputs)} + repl.update({old_name: new_name for old_name, new_name in zip(old_node.inputs, new_node1.inputs)}) + + for node in self.graph.values(): + for i, n in enumerate(node.inputs): + if n in repl: + node.inputs[i] = repl[n] + for i, n in enumerate(node.outputs): + if n in repl: + node.outputs[i] = repl[n] + + new_graph = OrderedDict() + for key, value in self.graph.items(): + if key == old_node.name: + new_graph[new_node1.name] = new_node1 + new_graph[new_node2.name] = new_node2 + else: + new_graph[key] = value + self.graph = new_graph + self._update_model_outputs() + def _update_model_outputs(self): '''Update the model outputs diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index f076a1e5f0..9e80da291f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -447,6 +447,7 @@ class SeparableConv1D(Layer): Attribute('out_width'), Attribute('n_chan'), Attribute('n_filt'), + Attribute('depth_multiplier', default=1), Attribute('filt_width'), Attribute('stride_width'), Attribute('pad_left'), @@ -484,12 +485,27 @@ def initialize(self): class DepthwiseConv1D(Conv1D): + _expected_attributes = [ + Attribute('in_width'), + Attribute('out_width'), + Attribute('n_chan'), + Attribute('depth_multiplier', default=1), + Attribute('filt_width'), + Attribute('stride_width'), + Attribute('pad_left'), + Attribute('pad_right'), + WeightAttribute('depthwise'), + WeightAttribute('bias'), + 
TypeAttribute('depthwise'), + TypeAttribute('bias'), + ] + def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_width'], self.attributes['n_chan']] + shape = [self.attributes['out_width'], self.attributes['n_chan'] * self.attributes['depth_multiplier']] dims = [f'OUT_HEIGHT_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'], self.attributes['out_width']] + shape = [self.attributes['n_chan'] * self.attributes['depth_multiplier'], self.attributes['out_width']] dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -498,6 +514,7 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Conv2D(Layer): @@ -594,6 +611,7 @@ class SeparableConv2D(Layer): Attribute('out_width'), Attribute('n_chan'), Attribute('n_filt'), + Attribute('depth_multiplier', default=1), Attribute('filt_height'), Attribute('filt_width'), Attribute('stride_height'), @@ -634,12 +652,41 @@ def initialize(self): class DepthwiseConv2D(Conv2D): + _expected_attributes = [ + Attribute('in_height'), + Attribute('in_width'), + Attribute('out_height'), + Attribute('out_width'), + Attribute('n_chan'), + Attribute('depth_multiplier', default=1), + Attribute('filt_height'), + Attribute('filt_width'), + Attribute('stride_height'), + Attribute('stride_width'), + Attribute('pad_top'), + Attribute('pad_bottom'), + Attribute('pad_left'), + Attribute('pad_right'), + WeightAttribute('weight'), + WeightAttribute('bias'), + TypeAttribute('weight'), + TypeAttribute('bias'), + ] + def initialize(self): if self.get_attr('data_format') == 'channels_last': - shape = [self.attributes['out_height'], self.attributes['out_width'], self.attributes['n_chan']] + shape = [ + self.attributes['out_height'], + self.attributes['out_width'], + self.attributes['n_chan'] * 
self.attributes['depth_multiplier'], + ] dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] else: - shape = [self.attributes['n_chan'], self.attributes['out_height'], self.attributes['out_width']] + shape = [ + self.attributes['n_chan'] * self.attributes['depth_multiplier'], + self.attributes['out_height'], + self.attributes['out_width'], + ] dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims) @@ -648,6 +695,7 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) + self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Pooling1D(Layer): diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 3aa247d03f..de1b7597df 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -33,6 +33,7 @@ register_flow( 'convert', [ + 'seperable_to_depthwise_and_conv', # has to be before precision inference 'infer_precision_types', 'channels_last_converter', 'remove_transpose_before_flatten', From 92e722272dfd2b8162ca003d562a800f8f09c98e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jun 2024 18:28:08 -0500 Subject: [PATCH 072/103] make conv_same_pad also trigger on depthwise, varius bug fixes --- hls4ml/backends/vivado/passes/conv_same_pad.py | 6 +++--- hls4ml/model/layers.py | 5 ++--- test/pytest/test_sepconv2d.py | 5 +++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index bb8354a3d0..dd282f34e3 100644 --- a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import Conv1D, Conv2D, DepthwiseConv1D, DepthwiseConv2D, SeparableConv1D, SeparableConv2D 
from hls4ml.model.optimizer import OptimizerPass @@ -7,7 +7,7 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv1D, SeparableConv1D)) + isinstance(node, (Conv1D, DepthwiseConv1D, SeparableConv1D)) and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) and node.get_attr('filt_width') != 1 ) @@ -55,7 +55,7 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv2D, SeparableConv2D)) + isinstance(node, (Conv2D, DepthwiseConv2D, SeparableConv2D)) and node.get_attr('padding') == 'same' and node.get_attr('filt_height') != 1 and node.get_attr('filt_width') != 1 diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 9e80da291f..cb826bb8a1 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -100,7 +100,6 @@ def __init__(self, model, name, attributes, inputs, outputs=None): layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): - print(f'{config_key=}, {config_value=}') config_key = convert_to_snake_case(config_key) if config_key in self.attributes: print( @@ -494,9 +493,9 @@ class DepthwiseConv1D(Conv1D): Attribute('stride_width'), Attribute('pad_left'), Attribute('pad_right'), - WeightAttribute('depthwise'), + WeightAttribute('weight'), WeightAttribute('bias'), - TypeAttribute('depthwise'), + TypeAttribute('weight'), TypeAttribute('bias'), ] diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index bee2227a86..4c46e7ab57 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -10,7 +10,6 @@ padds_options = ['same', 'valid'] chans_options = ['channels_last'] -io_type_options = ['io_parallel', 'io_stream'] strides_options = [(1, 1), (2, 2)] kernel_options = [(2, 2), (3, 3)] bias_options = [False] @@ -43,7 +42,9 @@ def test_sepconv2d(conv2d, chans, padds, strides, kernels, bias, io_type, backen 
model.compile(optimizer='adam', loss='mse') X_input = np.random.rand(100, *input_shape) keras_prediction = model.predict(X_input) - config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32,16>') + config = hls4ml.utils.config_from_keras_model( + model, default_precision='ap_fixed<32,16>', granularity="name", backend=backend + ) stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') kernel_cfg = str(kernels).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( From f12a7ea94e981cef23d059c564c0ff46cb3330f9 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 12 Jun 2024 20:58:51 -0500 Subject: [PATCH 073/103] add parsing of depth multiplier for 1D depthwise conv --- hls4ml/converters/keras/convolution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hls4ml/converters/keras/convolution.py b/hls4ml/converters/keras/convolution.py index 0eaa967844..2b24613094 100644 --- a/hls4ml/converters/keras/convolution.py +++ b/hls4ml/converters/keras/convolution.py @@ -21,6 +21,9 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_data'] = get_weights_data(data_reader, layer['name'], 'bias') + if 'depth_multiplier' in keras_layer['config']: + layer['depth_multiplier'] = keras_layer['config']['depth_multiplier'] + if 'filters' in keras_layer['config']: layer['n_filt'] = keras_layer['config']['filters'] else: From e2d270ea3c05ebfc7e08a290b39edeb38c58aef1 Mon Sep 17 00:00:00 2001 From: stzelepi Date: Mon, 26 Aug 2024 17:30:53 +0200 Subject: [PATCH 074/103] Finish resolving conficts with main --- .../backends/vivado/passes/conv_same_pad.py | 6 +- hls4ml/model/layers.py | 2 - .../vivado/nnet_utils/nnet_sepconv_stream.h | 234 +----------------- 3 files changed, 12 insertions(+), 230 deletions(-) diff --git a/hls4ml/backends/vivado/passes/conv_same_pad.py b/hls4ml/backends/vivado/passes/conv_same_pad.py index dd282f34e3..bb8354a3d0 100644 --- 
a/hls4ml/backends/vivado/passes/conv_same_pad.py +++ b/hls4ml/backends/vivado/passes/conv_same_pad.py @@ -1,4 +1,4 @@ -from hls4ml.model.layers import Conv1D, Conv2D, DepthwiseConv1D, DepthwiseConv2D, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import Conv1D, Conv2D, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass @@ -7,7 +7,7 @@ class InsertZeroPaddingBeforeConv1D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv1D, DepthwiseConv1D, SeparableConv1D)) + isinstance(node, (Conv1D, SeparableConv1D)) and ((node.get_attr('padding') == 'same') or (node.get_attr('padding') == 'causal')) and node.get_attr('filt_width') != 1 ) @@ -55,7 +55,7 @@ class InsertZeroPaddingBeforeConv2D(OptimizerPass): def match(self, node): is_match = ( - isinstance(node, (Conv2D, DepthwiseConv2D, SeparableConv2D)) + isinstance(node, (Conv2D, SeparableConv2D)) and node.get_attr('padding') == 'same' and node.get_attr('filt_height') != 1 and node.get_attr('filt_width') != 1 diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 024c2233cd..d8d1fb9c8f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -520,7 +520,6 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) - self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Conv2D(Layer): @@ -702,7 +701,6 @@ def initialize(self): ) self.add_bias(quantizer=self.get_attr('bias_quantizer')) - self.set_attr('n_filt', self.get_attr('n_chan') * self.get_attr('depth_multiplier')) class Pooling1D(Layer): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h index dea028d53b..9c16de1908 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv_stream.h @@ -4,210 +4,14 @@ #include "hls_stream.h" #include "nnet_common.h" #include "nnet_conv_stream.h" 
-#include namespace nnet { template -void depthwise_product_resource_rf_leq_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], +void depthwise_product(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; - const int nout = CONFIG_T::n_chan; - - const int rufactor = MIN(CONFIG_T::reuse_factor, nin); - // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); - // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); - const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); - // const int multscale = multiplier_limit; - - // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor - - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // std::cout << "LEQ IMPLE" << std::endl; - -InitAccum: - for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -int out_index = 0; - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int in_index = ir; - out_index = in_index % CONFIG_T::n_chan; - // int w_index = ir; - // int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma 
HLS UNROLL - - acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); - - in_index+=rufactor; - - out_index+=rufactor; - out_index -= ((out_index) >= CONFIG_T::n_chan)*CONFIG_T::n_chan; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < nout; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - - -template -void depthwise_product_resource_rf_gt_nchan_rem0(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; - const int nout = CONFIG_T::n_chan; - - const int rufactor = MIN(CONFIG_T::reuse_factor, nin); - // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); - // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); - const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); - // const int multscale = multiplier_limit; - - // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor - - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // std::cout << "REM0 IMPLE" << std::endl; - -InitAccum: - for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -int out_index = 0; 
- -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int in_index = ir; - // int w_index = ir; - // int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - - acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); - - in_index+=rufactor; - } - out_index++; - out_index -= ((out_index) == CONFIG_T::n_chan)*CONFIG_T::n_chan; - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < nout; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void depthwise_product_resource_rf_gt_nchan(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - const int nin = CONFIG_T::kernel_size * CONFIG_T::n_chan; - const int nout = CONFIG_T::n_chan; - - const int rufactor = MIN(CONFIG_T::reuse_factor, nin); - // const int multfactor = MIN(nin, CONFIG_T::reuse_factor); - // const int multiplier_limit = DIV_ROUNDUP(nin, multfactor); - const int block_factor = DIV_ROUNDUP(nin, CONFIG_T::reuse_factor); - // const int multscale = multiplier_limit; - - // assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - // assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_RESHAPE variable=data block factor=block_factor - - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=acc factor=block_factor - // 
std::cout << "GT IMPLE" << std::endl; - -InitAccum: - for (int iacc = 0; iacc < CONFIG_T::n_chan; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -int out_index = 0; - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int in_index = ir; - // int w_index = ir; - // int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - - out_index = in_index % CONFIG_T::n_chan; - acc[out_index] += static_cast(CONFIG_T::mult_config::template product::product(data[in_index], weights[in_index])); - - in_index+=rufactor; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < nout; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - - -template -void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - // #pragma HLS INLINE + #pragma HLS INLINE typename CONFIG_T::accum_t mult[CONFIG_T::kernel_size * CONFIG_T::n_chan]; typename CONFIG_T::accum_t acc[CONFIG_T::n_chan]; @@ -239,10 +43,8 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c // Accumulate multiplication result Accum1: for (int ii = 0; ii < CONFIG_T::kernel_size; ii++) { - // #pragma HLS PIPELINE II=1 rewind Accum2: for (int jj = 0; jj < CONFIG_T::n_chan; jj++) { - // #pragma HLS UNROLL int index = ii * CONFIG_T::n_chan + jj; acc[jj] += mult[index]; } @@ -256,22 +58,6 @@ void depthwise_product_latency(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_c } } -template -void depthwise_product_resource(data_T data[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T res[CONFIG_T::n_chan], - typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan], - typename CONFIG_T::bias_t biases[CONFIG_T::n_chan]) { - - #pragma HLS INLINE 
recursive - - if (CONFIG_T::reuse_factor < CONFIG_T::n_chan) { - depthwise_product_resource_rf_leq_nchan(data, res, weights, biases); - } else if (CONFIG_T::reuse_factor % CONFIG_T::n_chan == 0) { - depthwise_product_resource_rf_gt_nchan_rem0(data, res, weights, biases); - } else { - depthwise_product_resource_rf_gt_nchan(data, res, weights, biases); - } -} - template void depthwise_mult_buffer(hls::stream data_window[CONFIG_T::kernel_size * CONFIG_T::n_chan], res_T &res_pack, hls::stream &res_stream, unsigned &outputs_ready, @@ -292,9 +78,9 @@ void depthwise_mult_buffer(hls::stream data_window[ #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product_latency(data, res, weights, biases); + depthwise_product(data, res, weights, biases); } else { - depthwise_product_resource(data, res, weights, biases); + assert("Resource strategy for DepthwiseConv2D is not supported." && false); } CastLoop: @@ -416,11 +202,10 @@ void compute_depthwise_output_buffer_1d(const data_T &in_elem, hls::stream(kernel_data, res_out, + depthwise_product(kernel_data, res_out, weights, biases); } else { - depthwise_product_resource(kernel_data, res_out, - weights, biases); + assert("Resource strategy for DepthwiseConv1D is not supported." && false); } // Pack output @@ -482,11 +267,10 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, // Dense multiply #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { - depthwise_product_latency(kernel_data, res_out, + depthwise_product(kernel_data, res_out, weights, biases); } else { - depthwise_product_resource(kernel_data, res_out, - weights, biases); + assert("Resource strategy for DepthwiseConv2D is not supported." 
&& false); } // Pack output @@ -519,4 +303,4 @@ void compute_depthwise_output_buffer_2d(const data_T &in_elem, } } // namespace nnet -#endif \ No newline at end of file +#endif From fa6bd665bc4c735285ff42619615943e67c27d40 Mon Sep 17 00:00:00 2001 From: steltze Date: Mon, 18 Nov 2024 11:30:22 +0100 Subject: [PATCH 075/103] Supress removing tar for now --- hls4ml/writer/vitis_accelerator_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 70573bb5c2..306de31bb8 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -378,7 +378,7 @@ def write_driver(self, model): ) def write_new_tar(self, model): - os.remove(model.config.get_output_dir() + '.tar.gz') + # os.remove(model.config.get_output_dir() + '.tar.gz') super().write_tar(model) def write_hls(self, model): From b42210d64cf66bd85e816403bb263bfcf7a9ce60 Mon Sep 17 00:00:00 2001 From: steltze Date: Mon, 18 Nov 2024 17:14:04 +0100 Subject: [PATCH 076/103] Fix csynth and cosim --- .../templates/vitis_accelerator/build_lib.sh | 4 + .../vivado/nnet_utils/nnet_helpers.h | 106 +++++++++++++----- hls4ml/writer/vitis_accelerator_writer.py | 15 ++- 3 files changed, 93 insertions(+), 32 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator/build_lib.sh b/hls4ml/templates/vitis_accelerator/build_lib.sh index 69a2bace57..db929714cf 100644 --- a/hls4ml/templates/vitis_accelerator/build_lib.sh +++ b/hls4ml/templates/vitis_accelerator/build_lib.sh @@ -6,7 +6,11 @@ if [[ "$OSTYPE" == "linux-gnu" ]]; then elif [[ "$OSTYPE" == "darwin"* ]]; then CFLAGS="-O3 -fPIC -std=c++11" fi +VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR" +CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS" + INCFLAGS="-Ifirmware/ap_types/" + PROJECT=myproject LIB_STAMP=mystamp diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 
3938af347c..88a6561f7d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -2,7 +2,6 @@ #define NNET_HELPERS_H #include "hls_stream.h" -#include "ap_axi_sdata.h" #include #include #include @@ -12,6 +11,9 @@ #include #include +#ifdef VITIS_ACCELERATOR +#include "ap_axi_sdata.h" +#endif namespace nnet { #ifndef __SYNTHESIS__ @@ -162,20 +164,22 @@ template void convert_data(hls::stre } } -template void convert_data(srcType *src, hls::stream> &dst) { +#ifdef VITIS_ACCELERATOR +template void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { - hls::axis ctype; + hls::axis ctype; ctype.data = dstType(src[i]); dst.write(ctype); } } -template void convert_data(hls::stream> &src, dstType *dst) { +template void convert_data(hls::stream> &src, dstType *dst) { for (size_t i = 0; i < SIZE; i++) { - hls::axis ctype = src.read(); + hls::axis ctype = src.read(); dst[i] = dstType(ctype.data); } } +#endif extern bool trace_enabled; extern std::map *trace_outputs; @@ -263,8 +267,6 @@ template void save_layer_output(hls::stream &data, const } } -#endif - template void copy_data(std::vector src, dst_T dst[SIZE]) { typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; typename std::vector::const_iterator in_end = in_begin + SIZE; @@ -287,16 +289,31 @@ void copy_data(std::vector src, hls::stream &dst) { } } -template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { - for (auto i = 0; i < SIZE; i++) +// template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { +// for (auto i = 0; i < SIZE; i++) { +// dst[i].data = src[i]; +// if (i == SIZE - 1) { +// dst[i].last = 1; +// } else { +// dst[i].last = 0; +// } +// } +// } + +// #ifdef VITIS_ACCELERATOR +template void copy_data_axi(std::vector src, hls::stream &dst) { + for (auto i = 0; i < SIZE; i++) { + dst_T pack; + pack.data = src[i]; if (i == SIZE - 1) { - dst[i].data = src[i]; - dst[i].last = 1; + pack.last 
= 1; } else { - dst[i].data = src[i]; - dst[i].last = 0; + pack.last = 0; } + dst.write(pack); + } } +// #endif template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE; i++) { @@ -305,29 +322,65 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE / res_T::size; i++) { - res_T res_pack = result.read(); - for (int j = 0; j < res_T::size; j++) { - out << res_pack[j] << " "; - } - if (keep) - result.write(res_pack); +// template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +// for (int i = 0; i < SIZE / res_T::size; i++) { +// res_T res_pack = result.read(); +// for (int j = 0; j < res_T::size; j++) { +// out << res_pack[j] << " "; +// } +// if (keep) { +// result.write(res_pack); +// } +// } +// out << std::endl; +// } + +// #ifdef VITIS_ACCELERATOR +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / underlying_res_T::size; i++) { + res_T res_pack; + for (int j = 0; j < underlying_res_T::size; j++) { + res_pack = result.read(); + out << res_pack.data << " "; + if (keep) { + result.write(res_pack); + } + } } out << std::endl; } +// #endif template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -template void fill_zero(hls::stream &data) { - for (int i = 0; i < SIZE / data_T::size; i++) { +// template void fill_zero(hls::stream &data) { +// for (int i = 0; i < SIZE / data_T::size; i++) { +// data_T data_pack; +// for (int j = 0; j < data_T::size; j++) { +// data_pack[j] = 0.; +// } +// data.write(data_pack); +// } +// } + +// #ifdef VITIS_ACCELERATOR +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / underlying_data_T::size; i++) { data_T data_pack; - for (int j = 0; j < data_T::size; j++) { - data_pack[j] = 0.; + for (int j = 0; j < 
underlying_data_T::size; j++) { + data_pack.data = 0.; + if ((i==(SIZE / underlying_data_T::size-1)) && (j==(underlying_data_T::size-1))) { + data_pack.last = 1; + } + else { + data_pack.last = 0; + } + data.write(data_pack); } - data.write(data_pack); + } } +// #endif template int read_file_1D(const char *filename, dataType data[nrows]) { FILE *fp; @@ -386,6 +439,7 @@ template void hls_stream_debug(hls::stream &dat res << datareg; } } +#endif constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } diff --git a/hls4ml/writer/vitis_accelerator_writer.py b/hls4ml/writer/vitis_accelerator_writer.py index 306de31bb8..9019021fa2 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_writer.py @@ -40,7 +40,7 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_config.get_interface() == 'axi_stream': - newline += f'typedef hls::axis<{inp_axi_t}, 0, 0, 0> my_pkt;;\n' + newline += f'typedef hls::axis my_pkt;\n' else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' @@ -277,20 +277,23 @@ def write_wrapper_test(self, model): newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = ( - line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'hls::stream< my_pkt >') + line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') ) elif out.size_cpp() in line or out.name in line or out.type.name in line: newline = ( - line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'hls::stream< my_pkt >') + line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') ) 
else: newline = line if self.vitis_accelerator_config.get_interface() == 'axi_stream': if 'nnet::fill_zero' in line: - indent = line.split('n')[0] - newline = indent + 'inputs[N_IN-1].last = 1;\n' + newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") + # indent = line.split('n')[0] + # newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' if 'copy_data' in line: - newline = newline.replace('copy_data', 'copy_data_axi') + newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") + if 'print_result' in line: + newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") fout.write(newline) f.close() From 1303bbaf67f3f756762fc49370602cb3d30f0a6e Mon Sep 17 00:00:00 2001 From: steltze Date: Mon, 18 Nov 2024 17:35:21 +0100 Subject: [PATCH 077/103] Fix tcl script to find cosim report --- hls4ml/templates/vivado/build_prj.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index af37b0f4aa..5714f05f1a 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -206,7 +206,7 @@ if {$opt(cosim)} { set time_end [clock clicks -milliseconds] puts "INFO:" - if {[string equal "$backend" "vivadoaccelerator"]} { + if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisaccelerator"]} { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] } else { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] From 8d3a1f27a1db1e8fd7ba231621e687041c768f17 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 00:35:39 +0100 Subject: [PATCH 078/103] Correct PYNQ Z2 vivado tcl script, bitstream generated --- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git 
a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl index aa06e8a6d2..ecdfb2ac4b 100644 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -4,7 +4,7 @@ source [file join $tcldir project.tcl] create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force -set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +# set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] set_property ip_repo_paths ${project_name}_prj [current_project] update_ip_catalog @@ -17,23 +17,29 @@ endgroup apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] startgroup -set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +set_property -dict [list \ + CONFIG.PCW_USE_S_AXI_HP0 {1} \ + CONFIG.PCW_USE_S_AXI_HP2 {1} \ +] [get_bd_cells processing_system7_0] +# set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] endgroup startgroup create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 endgroup -set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells 
axi_dma_0] +set_property -dict [list \ + CONFIG.c_include_sg {0} \ + CONFIG.c_m_axi_mm2s_data_width {64} \ + CONFIG.c_m_axi_s2mm_data_width {64} \ + CONFIG.c_mm2s_burst_size {32} \ + CONFIG.c_sg_length_width {26} \ +] [get_bd_cells axi_dma_0] -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +# set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +# set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] +# startgroup create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 @@ -42,9 +48,23 @@ endgroup connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] connect_bd_intf_net 
[get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP2} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP2] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins myproject_axi_0/ap_clk] +endgroup + + +# apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +#todo: make clock a variable +# apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] +validate_bd_design 
+ +# group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] +open_bd_design {./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd} make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top @@ -52,8 +72,10 @@ add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_ reset_run impl_1 reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 +#todo: make number of jobs a variable +launch_runs impl_1 -to_step write_bitstream -jobs 18 wait_on_run -timeout 360 impl_1 +# open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages From a8e04978bb1ddbf681257ba4322b16f4866c2987 Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 10:32:24 +0100 Subject: [PATCH 079/103] Clean pynq tcl script --- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl index ecdfb2ac4b..c481995dae 100644 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -21,7 +21,6 @@ set_property -dict [list \ CONFIG.PCW_USE_S_AXI_HP0 {1} \ CONFIG.PCW_USE_S_AXI_HP2 {1} \ ] [get_bd_cells processing_system7_0] -# set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] endgroup startgroup @@ -37,10 +36,6 @@ set_property -dict [list \ CONFIG.c_sg_length_width {26} \ ] [get_bd_cells axi_dma_0] -# set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -# set_property -dict [list CONFIG.c_include_sg {0} 
CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] -# - startgroup create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 endgroup @@ -48,6 +43,7 @@ endgroup connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] +#todo: make clock a variable startgroup apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] @@ -55,15 +51,8 @@ apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Cl apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins myproject_axi_0/ap_clk] endgroup - -# apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins 
axi_dma_0/M_AXI_S2MM] - -#todo: make clock a variable -# apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] validate_bd_design -# group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] - open_bd_design {./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd} make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top @@ -73,9 +62,8 @@ add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_ reset_run impl_1 reset_run synth_1 #todo: make number of jobs a variable -launch_runs impl_1 -to_step write_bitstream -jobs 18 +launch_runs impl_1 -to_step write_bitstream -jobs 10 wait_on_run -timeout 360 impl_1 -# open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages From 48686d33ed5d37ee9208eab8c361581b467bae6d Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 12:14:49 +0100 Subject: [PATCH 080/103] Fix compatibility of nnet helper functions with vitis axis --- .../vivado/nnet_utils/nnet_helpers.h | 78 +++++++++---------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 88a6561f7d..9949ee7d80 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -11,6 +11,8 @@ #include #include +// this header cannot be included by Vivado HLS +// "VITIS_ACCELERATOR" is defined on the build_lib.sh of the Vitis Accelerator backend files #ifdef VITIS_ACCELERATOR #include "ap_axi_sdata.h" #endif @@ -289,18 +291,17 @@ void copy_data(std::vector src, hls::stream &dst) { } } -// template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { -// for (auto i = 0; i < SIZE; i++) 
{ -// dst[i].data = src[i]; -// if (i == SIZE - 1) { -// dst[i].last = 1; -// } else { -// dst[i].last = 0; -// } -// } -// } - -// #ifdef VITIS_ACCELERATOR +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) { + dst[i].data = src[i]; + if (i == SIZE - 1) { + dst[i].last = 1; + } else { + dst[i].last = 0; + } + } +} + template void copy_data_axi(std::vector src, hls::stream &dst) { for (auto i = 0; i < SIZE; i++) { dst_T pack; @@ -313,7 +314,6 @@ template void copy_data_axi(std::vector< dst.write(pack); } } -// #endif template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE; i++) { @@ -322,20 +322,20 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -// template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { -// for (int i = 0; i < SIZE / res_T::size; i++) { -// res_T res_pack = result.read(); -// for (int j = 0; j < res_T::size; j++) { -// out << res_pack[j] << " "; -// } -// if (keep) { -// result.write(res_pack); -// } -// } -// out << std::endl; -// } - -// #ifdef VITIS_ACCELERATOR +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) { + result.write(res_pack); + } + } + out << std::endl; +} + +// compatible with Vitis Accelerator for res_T = hls::axis template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / underlying_res_T::size; i++) { res_T res_pack; @@ -349,21 +349,20 @@ template void print_result(hl } out << std::endl; } -// #endif template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -// template void fill_zero(hls::stream &data) { -// for (int i = 0; i < SIZE / data_T::size; i++) { -// data_T data_pack; -// for 
(int j = 0; j < data_T::size; j++) { -// data_pack[j] = 0.; -// } -// data.write(data_pack); -// } -// } - -// #ifdef VITIS_ACCELERATOR +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +// compatible with Vitis Accelerator for res_T = hls::axis template void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / underlying_data_T::size; i++) { data_T data_pack; @@ -380,7 +379,6 @@ template void fill_zero(hls } } -// #endif template int read_file_1D(const char *filename, dataType data[nrows]) { FILE *fp; From bae450b04b0cd6517d7044b6e2466d6e9949567d Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 15:04:25 +0100 Subject: [PATCH 081/103] Setup vivado tcl script for zcu102 --- .../pynq-z2/tcl_scripts/axi_lite_design.tcl | 26 ------------- .../zcu102/tcl_scripts/axi_stream_design.tcl | 37 +++++++++++-------- 2 files changed, 22 insertions(+), 41 deletions(-) delete mode 100644 hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl deleted file mode 100644 index 4d23da26cc..0000000000 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl +++ /dev/null @@ -1,26 +0,0 @@ -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force - -set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] -set_property ip_repo_paths ${project_name}_prj [current_project] -update_ip_catalog - -# Create Block Designer design -create_bd_design "design_1" -create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 -apply_bd_automation -rule 
xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] -create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS] - -make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v - -reset_run impl_1 -reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl index 5d886c6f25..103fec0178 100644 --- a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl @@ -13,37 +13,44 @@ set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_proje update_ip_catalog startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_1 endgroup -apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_1] -set_property -dict [list 
CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] +set_property -dict [list \ + CONFIG.PSU__SAXIGP2__DATA_WIDTH {64} \ + CONFIG.PSU__SAXIGP4__DATA_WIDTH {64} \ + CONFIG.PSU__USE__S_AXI_GP2 {1} \ + CONFIG.PSU__USE__S_AXI_GP4 {1} \ +] [get_bd_cells zynq_ultra_ps_e_1] startgroup create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 endgroup -set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] -endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list \ + CONFIG.c_include_sg {0} \ + CONFIG.c_m_axi_mm2s_data_width {64} \ + CONFIG.c_m_axi_s2mm_data_width {64} \ + CONFIG.c_mm2s_burst_size {32} \ + CONFIG.c_sg_length_width {26} \ +] [get_bd_cells axi_dma_0] -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master 
{Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] -endgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_1/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_1/S_AXI_HP0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP0_FPD] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_1/S_AXI_HP2_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP2_FPD] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_1/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/M_AXI_HPM1_FPD] startgroup create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 endgroup + 
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top From dde91243bb56815671207aa69ea00c5cee5c608b Mon Sep 17 00:00:00 2001 From: steltze Date: Tue, 19 Nov 2024 15:47:19 +0100 Subject: [PATCH 082/103] Rename backend to VitisAcceleratorIPFLow to prevent conflicts with kernel flow-versal --- hls4ml/backends/__init__.py | 6 +- .../__init__.py | 0 .../passes/__init__.py | 0 .../passes/fifo_depth_optimization.py | 0 .../supported_boards.json | 0 .../vitis_accelerator_ip_flow_backend.py} | 12 ++-- .../vitis_accelerator_ip_flow_config.py} | 16 ++--- .../build_lib.sh | 2 +- .../myproject_axi.cpp | 0 .../myproject_axi.h | 0 .../python_drivers/axi_stream_driver.py | 0 .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 8 +-- .../python_drivers/axi_stream_driver.py | 0 .../zcu102/tcl_scripts/axi_stream_design.tcl | 0 hls4ml/templates/vivado/build_prj.tcl | 2 +- .../vivado/nnet_utils/nnet_helpers.h | 6 +- hls4ml/writer/__init__.py | 4 +- ...py => vitis_accelerator_ip_flow_writer.py} | 60 +++++++++---------- 18 files changed, 58 insertions(+), 58 deletions(-) rename hls4ml/backends/{vitis_accelerator => vitis_accelerator_ip_flow}/__init__.py (100%) rename hls4ml/backends/{vitis_accelerator => 
vitis_accelerator_ip_flow}/passes/__init__.py (100%) rename hls4ml/backends/{vitis_accelerator => vitis_accelerator_ip_flow}/passes/fifo_depth_optimization.py (100%) rename hls4ml/backends/{vitis_accelerator => vitis_accelerator_ip_flow}/supported_boards.json (100%) rename hls4ml/backends/{vitis_accelerator/vitis_accelerator_backend.py => vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py} (95%) rename hls4ml/backends/{vitis_accelerator/vitis_accelerator_config.py => vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py} (90%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/build_lib.sh (92%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/myproject_axi.cpp (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/myproject_axi.h (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/pynq-z2/python_drivers/axi_stream_driver.py (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/pynq-z2/tcl_scripts/axi_stream_design.tcl (88%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/zcu102/python_drivers/axi_stream_driver.py (100%) rename hls4ml/templates/{vitis_accelerator => vitis_accelerator_ip_flow}/zcu102/tcl_scripts/axi_stream_design.tcl (100%) rename hls4ml/writer/{vitis_accelerator_writer.py => vitis_accelerator_ip_flow_writer.py} (89%) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index f1eebd3c1f..1f60bdb449 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -10,13 +10,13 @@ from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip -from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip -from hls4ml.backends.vitis_accelerator.vitis_accelerator_config import VitisAcceleratorConfig # noqa: F401 
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import VitisAcceleratorIPFlowBackend # isort: skip +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import VitisAcceleratorIPFlowConfig # noqa: F401 register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) register_backend('Vitis', VitisBackend) -register_backend('VitisAccelerator', VitisAcceleratorBackend) +register_backend('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowBackend) register_backend('Quartus', QuartusBackend) register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/vitis_accelerator/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py similarity index 100% rename from hls4ml/backends/vitis_accelerator/__init__.py rename to hls4ml/backends/vitis_accelerator_ip_flow/__init__.py diff --git a/hls4ml/backends/vitis_accelerator/passes/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py similarity index 100% rename from hls4ml/backends/vitis_accelerator/passes/__init__.py rename to hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py diff --git a/hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py similarity index 100% rename from hls4ml/backends/vitis_accelerator/passes/fifo_depth_optimization.py rename to hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json similarity index 100% rename from hls4ml/backends/vitis_accelerator/supported_boards.json rename to hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py 
b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py similarity index 95% rename from hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py rename to hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 2e3de9a1cd..6ade53b39d 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -5,9 +5,9 @@ from hls4ml.report import parse_vivado_report -class VitisAcceleratorBackend(VitisBackend): +class VitisAcceleratorIPFlowBackend(VitisBackend): def __init__(self): - super(VivadoBackend, self).__init__(name='VitisAccelerator') + super(VivadoBackend, self).__init__(name='VitisAcceleratorIPFlow') self._register_layer_attributes() self._register_flows() @@ -21,7 +21,7 @@ def build( validation=False, export=False, vsynth=False, - fifo_opt=False, + # fifo_opt=False, bitfile=False, ): # run the VitisBackend build @@ -37,9 +37,9 @@ def build( # fifo_opt=fifo_opt, ) # Get Config to view Board and Platform - from hls4ml.backends import VitisAcceleratorConfig + from hls4ml.backends import VitisAcceleratorIPFlowConfig - vitis_accelerator_config = VitisAcceleratorConfig( + vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( model.config, model.get_input_variables(), model.get_output_variables() ) # now make a bitfile @@ -154,7 +154,7 @@ def get_writer_flow(self): def _register_flows(self): vivado_ip = 'vivado:ip' - writer_passes = ['make_stamp', 'vitisaccelerator:write_hls'] + writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) self._default_flow = vivado_ip diff --git a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py similarity index 90% rename from 
hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py rename to hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py index b0bf4e894b..d00e54a284 100644 --- a/hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py @@ -6,7 +6,7 @@ from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType -class VitisAcceleratorConfig: +class VitisAcceleratorIPFlowConfig: def __init__(self, config, model_inputs, model_outputs): self.config = config.config self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2') @@ -54,10 +54,10 @@ def __init__(self, config, model_inputs, model_outputs): assert ( len(model_inputs) == 1 - ), "Only models with one input tensor are currently supported by VitisAcceleratorBackend" + ), "Only models with one input tensor are currently supported by VitisAcceleratorIPFlowBackend" assert ( len(model_outputs) == 1 - ), "Only models with one output tensor are currently supported by VitisAcceleratorBackend" + ), "Only models with one output tensor are currently supported by VitisAcceleratorIPFlowBackend" self.inp = model_inputs[0] self.out = model_outputs[0] inp_axi_t = self.input_type @@ -131,16 +131,16 @@ def get_clock_period(self): def get_driver_path(self): if self.board.startswith('alveo'): - return '../templates/vitis_accelerator/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() else: - return '../templates/vitis_accelerator/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + return '../templates/vitis_accelerator_ip_flow/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() def get_driver_file(self): driver_ext = '.py' if self.driver == 'python' else '.h' return self.interface + '_driver' + driver_ext def 
get_krnl_rtl_src_dir(self): - return '../templates/vitis_accelerator/' + 'alveo/' + '/krnl_rtl_src' + return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + '/krnl_rtl_src' def get_input_type(self): return self.input_type @@ -157,6 +157,6 @@ def get_tcl_file_path(self): if tcl_script is None: raise Exception('No tcl script definition available for the desired interface in supported_board.json') if self.board.startswith('alveo'): - return '../templates/vitis_accelerator/' + 'alveo/' + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + '/tcl_scripts/' + tcl_script else: - return '../templates/vitis_accelerator/' + self.board + '/tcl_scripts/' + tcl_script + return '../templates/vitis_accelerator_ip_flow/' + self.board + '/tcl_scripts/' + tcl_script diff --git a/hls4ml/templates/vitis_accelerator/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh similarity index 92% rename from hls4ml/templates/vitis_accelerator/build_lib.sh rename to hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh index db929714cf..262ce00d63 100644 --- a/hls4ml/templates/vitis_accelerator/build_lib.sh +++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh @@ -6,7 +6,7 @@ if [[ "$OSTYPE" == "linux-gnu" ]]; then elif [[ "$OSTYPE" == "darwin"* ]]; then CFLAGS="-O3 -fPIC -std=c++11" fi -VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR" +VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR_IP_FLOW" CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS" INCFLAGS="-Ifirmware/ap_types/" diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp similarity index 100% rename from hls4ml/templates/vitis_accelerator/myproject_axi.cpp rename to hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp diff --git a/hls4ml/templates/vitis_accelerator/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h similarity index 100% rename from 
hls4ml/templates/vitis_accelerator/myproject_axi.h rename to hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py similarity index 100% rename from hls4ml/templates/vitis_accelerator/pynq-z2/python_drivers/axi_stream_driver.py rename to hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py diff --git a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl similarity index 88% rename from hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl rename to hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl index c481995dae..e8db1e6782 100644 --- a/hls4ml/templates/vitis_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -2,7 +2,7 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -create_project project_1 ${project_name}_vitis_accelerator -part xc7z020clg400-1 -force +create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xc7z020clg400-1 -force # set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] set_property ip_repo_paths ${project_name}_prj [current_project] @@ -53,11 +53,11 @@ endgroup validate_bd_design -open_bd_design {./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd} +open_bd_design {./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd} -make_wrapper -files [get_files ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +make_wrapper -files [get_files 
./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${project_name}_vitis_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v +add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v reset_run impl_1 reset_run synth_1 diff --git a/hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py similarity index 100% rename from hls4ml/templates/vitis_accelerator/zcu102/python_drivers/axi_stream_driver.py rename to hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py diff --git a/hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl similarity index 100% rename from hls4ml/templates/vitis_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl rename to hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 5714f05f1a..50596091f2 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -206,7 +206,7 @@ if {$opt(cosim)} { set time_end [clock clicks -milliseconds] puts "INFO:" - if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisaccelerator"]} { + if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisacceleratoripflow"]} { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] } else { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 9949ee7d80..2942cf08fa 
100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -12,8 +12,8 @@ #include // this header cannot be included by Vivado HLS -// "VITIS_ACCELERATOR" is defined on the build_lib.sh of the Vitis Accelerator backend files -#ifdef VITIS_ACCELERATOR +// "VITIS_ACCELERATOR_IP_FLOW" is defined on the build_lib.sh of the `Vitis Accelerator` template files +#ifdef VITIS_ACCELERATOR_IP_FLOW #include "ap_axi_sdata.h" #endif namespace nnet { @@ -166,7 +166,7 @@ template void convert_data(hls::stre } } -#ifdef VITIS_ACCELERATOR +#ifdef VITIS_ACCELERATOR_IP_FLOW template void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype; diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index c49b23f58c..70a2eabd75 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -2,7 +2,7 @@ from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter from hls4ml.writer.vitis_writer import VitisWriter -from hls4ml.writer.vitis_accelerator_writer import VitisAcceleratorWriter +from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.writers import Writer, get_writer, register_writer # noqa: F401 @@ -10,7 +10,7 @@ register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) -register_writer('VitisAccelerator', VitisAcceleratorWriter) +register_writer('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowWriter) register_writer('Quartus', QuartusWriter) register_writer('Catapult', CatapultWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/vitis_accelerator_writer.py 
b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py similarity index 89% rename from hls4ml/writer/vitis_accelerator_writer.py rename to hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 9019021fa2..e1817b87e5 100644 --- a/hls4ml/writer/vitis_accelerator_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -6,17 +6,17 @@ from hls4ml.writer.vitis_writer import VitisWriter -class VitisAcceleratorWriter(VitisWriter): +class VitisAcceleratorIPFlowWriter(VitisWriter): def __init__(self): super().__init__() - self.vitis_accelerator_config = None + self.vitis_accelerator_ip_flow_config = None def write_axi_wrapper(self, model): '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces Args: model : The ModelGraph to write the wrapper for ''' - inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() indent = ' ' ####################### @@ -24,7 +24,7 @@ def write_axi_wrapper(self, model): ####################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.h')) + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.h')) fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') for line in f.readlines(): @@ -39,7 +39,7 @@ def write_axi_wrapper(self, model): newline = '' newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += f'typedef hls::axis my_pkt;\n' else: # TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' @@ -54,7 +54,7 @@ def write_axi_wrapper(self, model): # myproject_axi.cpp ####################### - f 
= open(os.path.join(filedir, '../templates/vitis_accelerator/myproject_axi.cpp')) + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.cpp')) fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') io_type = model.config.get_config_value("IOType") @@ -66,7 +66,7 @@ def write_axi_wrapper(self, model): newline = f'#include "{model.config.get_project_name()}_axi.h"\n' elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' - # if self.vitis_accelerator_config.get_interface() == 'axi_stream': + # if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': # newline += indent + 'bool is_last = false;\n' if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' @@ -83,12 +83,12 @@ def write_axi_wrapper(self, model): elif '// hls-fpga-machine-learning insert call' in line: newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' elif '// hls-fpga-machine-learning insert interface' in line: - if self.vitis_accelerator_config.get_interface() == 'axi_lite': # TODO: handle axi_lite + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' - elif self.vitis_accelerator_config.get_interface() == 'axi_master': # TODO: handle axi_master + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( @@ -97,7 +97,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS INTERFACE m_axi 
depth={} port=out offset=slave bundle=OUT_BUS\n'.format( model.get_output_variables()[0].pragma[1] ) - elif self.vitis_accelerator_config.get_interface() == 'axi_stream': + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline = '' newline += indent + '#pragma HLS INTERFACE axis port=in\n' newline += indent + '#pragma HLS INTERFACE axis port=out\n' @@ -109,7 +109,7 @@ def write_axi_wrapper(self, model): if io_type == 'io_parallel': # TODO: handle io_parallel newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + indent + 'is_last |= (in[i].last == 1)? true: false;\n' @@ -130,7 +130,7 @@ def write_axi_wrapper(self, model): # newline += indent + indent + 'pragma HLS aggregate variable=ctype compact=auto' # TODO: check if needed newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += ( indent + indent @@ -164,7 +164,7 @@ def write_axi_wrapper(self, model): if io_type == 'io_parallel': # TODO: handle this case newline = '' newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' newline += indent + indent + 'out[i].data = out_local[i]; // Write output with cast\n' newline += indent + indent + 'out[i].last = (is_last && (i == N_OUT - 1))? 
true : false;\n' @@ -179,7 +179,7 @@ def write_axi_wrapper(self, model): newline += indent + indent + '{result_t} ctype = out_local.read();\n' newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += ( indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' ) @@ -235,7 +235,7 @@ def modify_build_script(self, model): # build_lib.sh ################### - f = open(os.path.join(filedir, '../templates/vitis_accelerator/build_lib.sh')) + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/build_lib.sh')) fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w') for line in f.readlines(): @@ -253,7 +253,7 @@ def write_wrapper_test(self, model): oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' - inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_config.get_corrected_types() + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() f = open(oldfile) fout = open(newfile, 'w') @@ -285,7 +285,7 @@ def write_wrapper_test(self, model): ) else: newline = line - if self.vitis_accelerator_config.get_interface() == 'axi_stream': + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': if 'nnet::fill_zero' in line: newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") # indent = line.split('n')[0] @@ -338,16 +338,16 @@ def write_wrapper_test(self, model): def write_board_script(self, model): ''' - Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAccelerator + Write the tcl scripts and kernel sources to create a Vivado IPI project for the 
VitisAcceleratorIPFlow ''' filedir = os.path.dirname(os.path.abspath(__file__)) copyfile( - os.path.join(filedir, self.vitis_accelerator_config.get_tcl_file_path()), + os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_tcl_file_path()), f'{model.config.get_output_dir()}/design.tcl', ) # Generic alveo board - if self.vitis_accelerator_config.get_board().startswith('alveo'): - src_dir = os.path.join(filedir, self.vitis_accelerator_config.get_krnl_rtl_src_dir()) + if self.vitis_accelerator_ip_flow_config.get_board().startswith('alveo'): + src_dir = os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_krnl_rtl_src_dir()) dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' copy_tree(src_dir, dst_dir) @@ -358,17 +358,17 @@ def write_board_script(self, model): f.write('variable project_name\n') f.write(f'set project_name "{model.config.get_project_name()}"\n') f.write('variable backend\n') - f.write('set backend "vitisaccelerator"\n') + f.write('set backend "vitisacceleratoripflow"\n') f.write('variable part\n') - f.write(f'set part "{self.vitis_accelerator_config.get_part()}"\n') + f.write(f'set part "{self.vitis_accelerator_ip_flow_config.get_part()}"\n') f.write('variable clock_period\n') f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod'))) f.write('variable clock_uncertainty\n') f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%'))) f.write('variable version\n') f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0'))) - if self.vitis_accelerator_config.get_interface() == 'axi_stream': - in_bit, out_bit = self.vitis_accelerator_config.get_io_bitwidth() + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + in_bit, out_bit = self.vitis_accelerator_ip_flow_config.get_io_bitwidth() f.write(f'set bit_width_hls_output {in_bit}\n') f.write(f'set bit_width_hls_input {out_bit}\n') f.close() @@ -376,8 
+376,8 @@ def write_board_script(self, model): def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) copyfile( - os.path.join(filedir, self.vitis_accelerator_config.get_driver_path()), - ('{}/' + self.vitis_accelerator_config.get_driver_file()).format(model.config.get_output_dir()), + os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_driver_path()), + ('{}/' + self.vitis_accelerator_ip_flow_config.get_driver_file()).format(model.config.get_output_dir()), ) def write_new_tar(self, model): @@ -386,12 +386,12 @@ def write_new_tar(self, model): def write_hls(self, model): """ - Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAccelerator/AXI interface + Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAcceleratorIPFlow/AXI interface """ # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package - from hls4ml.backends import VitisAcceleratorConfig + from hls4ml.backends import VitisAcceleratorIPFlowConfig - self.vitis_accelerator_config = VitisAcceleratorConfig( + self.vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( model.config, model.get_input_variables(), model.get_output_variables() ) super().write_hls(model) From 663181ff0f640f0467a1a0507927db3731a1a8f6 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 12:05:55 +0100 Subject: [PATCH 083/103] Fix compatiblity between axi stream and io parallel --- .../vivado/nnet_utils/nnet_helpers.h | 43 +++++++++++++- .../vitis_accelerator_ip_flow_writer.py | 56 ++++++++++--------- 2 files changed, 70 insertions(+), 29 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 2942cf08fa..389d687089 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -167,6 +167,7 @@ template void convert_data(hls::stre 
} #ifdef VITIS_ACCELERATOR_IP_FLOW +//todo avoid hardcoding hls::axis and use template template void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype; @@ -322,7 +323,9 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +template ::value, int>::type = 0> +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / res_T::size; i++) { res_T res_pack = result.read(); for (int j = 0; j < res_T::size; j++) { @@ -335,7 +338,23 @@ template void print_result(hls::stream &result out << std::endl; } -// compatible with Vitis Accelerator for res_T = hls::axis +// compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel +template ::value, int>::type = 0> +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + res_T res_pack = result.read(); + + out << res_pack.data << " "; + + if (keep) { + result.write(res_pack); + } + } + out << std::endl; +} + +// compatible with Vitis Accelerator for res_T = hls::axis and io_stream template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / underlying_res_T::size; i++) { res_T res_pack; @@ -352,7 +371,9 @@ template void print_result(hl template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -template void fill_zero(hls::stream &data) { +template ::value, int>::type = 0> +void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / data_T::size; i++) { data_T data_pack; for (int j = 0; j < data_T::size; j++) { @@ -362,6 +383,22 @@ template void fill_zero(hls::stream &data) { } } +template ::value, int>::type = 0> +void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE; i++) { + data_T data_pack; + data_pack.data = 0.; + if (i==SIZE-1) { + data_pack.last = 1; + } + else 
{ + data_pack.last = 0; + } + data.write(data_pack); + } +} + // compatible with Vitis Accelerator for res_T = hls::axis template void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / underlying_data_T::size; i++) { diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index e1817b87e5..535a43b4bc 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -66,11 +66,12 @@ def write_axi_wrapper(self, model): newline = f'#include "{model.config.get_project_name()}_axi.h"\n' elif '// hls-fpga-machine-learning insert local vars' in line: newline = '' - # if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - # newline += indent + 'bool is_last = false;\n' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += indent + 'bool is_last = false;\n' if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' - newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + 'my_pkt tmp;\n' elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' @@ -111,17 +112,17 @@ def write_axi_wrapper(self, model): newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' - newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' - newline += indent + indent + 'is_last |= (in[i].last == 1)? 
true: false;\n' + newline += indent + indent + 'tmp = in.read(); // Read input with cast\n' + newline += indent + indent + 'in_local[i] = tmp.data;\n' + newline += indent + indent + 'is_last = tmp.last;\n' else: newline += indent + indent + '#pragma HLS UNROLL\n' newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' newline += indent + '}\n' + newline += indent + 'tmp.last = 0;\n' elif io_type == 'io_stream': newline = '' - newline += indent + 'my_pkt tmp_a;\n' - - newline += indent + 'my_pkt tmp_b;\n' + newline += indent + 'my_pkt tmp;\n' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed @@ -135,17 +136,17 @@ def write_axi_wrapper(self, model): indent + indent + indent - + 'in.read(tmp_a);\n' + + 'in.read(tmp);\n' ) newline += ( indent + indent + indent - + 'ctype[j] = tmp_a.data;\n' + + 'ctype[j] = tmp.data;\n' + ) + newline += ( + indent + indent + indent + 'is_last = tmp.last;\n' ) - # newline += ( - # indent + indent + indent + 'is_last |= (in[i * input_t::size + j].last == 1)? 
true : false;\n' - # ) else: # TODO: handle this case newline += ( indent @@ -156,8 +157,7 @@ def write_axi_wrapper(self, model): newline += indent + indent + '}}\n' newline += indent + indent + 'in_local.write(ctype);\n' newline += indent + '}}\n' - newline += indent + 'tmp_b = tmp_a;\n' - newline += indent + 'tmp_b.last = 0;\n' + newline += indent + 'tmp.last = 0;\n' newline = newline.format(input_t=inp.type.name) elif '// hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") @@ -166,8 +166,9 @@ def write_axi_wrapper(self, model): newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + indent + '#pragma HLS PIPELINE\n' - newline += indent + indent + 'out[i].data = out_local[i]; // Write output with cast\n' - newline += indent + indent + 'out[i].last = (is_last && (i == N_OUT - 1))? true : false;\n' + newline += indent + indent + 'tmp.data = out_local[i];\n' + newline += indent + indent + 'tmp.last = (is_last && (i == N_OUT - 1))? 
true : false;\n' + newline += indent + indent + 'out.write(tmp);\n' else: newline += indent + indent + '#pragma HLS UNROLL\n' newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' @@ -181,15 +182,15 @@ def write_axi_wrapper(self, model): # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += ( - indent + indent + indent + f'tmp_b.data = ({inp_axi_t}) (ctype[j]);\n' + indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' ) newline += ( - indent + indent + indent + 'if(tmp_a.last == 1) {{tmp_b.last = (((i+1)*(j+1))==N_OUT);}}\n' + indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' ) newline += ( - indent + indent + indent + 'out.write(tmp_b);\n' + indent + indent + indent + 'out.write(tmp);\n' ) else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' @@ -260,6 +261,7 @@ def write_wrapper_test(self, model): inp = model.get_input_variables()[0] out = model.get_output_variables()[0] + io_type = model.config.get_config_value("IOType") for line in f.readlines(): if f'{model.config.get_project_name()}.h' in line: @@ -286,14 +288,16 @@ def write_wrapper_test(self, model): else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - if 'nnet::fill_zero' in line: - newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") - # indent = line.split('n')[0] - # newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' if 'copy_data' in line: newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") - if 'print_result' in line: - newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") + + if io_type == 'io_stream': + if 'nnet::fill_zero' in line: + newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") + # indent = line.split('n')[0] + # 
newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' + if 'print_result' in line: + newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") fout.write(newline) f.close() From e32f4d0b2763452bb82095d61d5f51fba9220187 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 13:36:43 +0100 Subject: [PATCH 084/103] Update pynq driver for zcu102 --- .../python_drivers/axi_stream_driver.py | 91 +++++++++++-------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py index 1aac79f2d3..fda308e9ca 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py @@ -1,39 +1,40 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay from datetime import datetime - +import pynq.lib.dma import numpy as np -from pynq import Overlay, allocate +from pynq import PL class NeuralNetworkOverlay(Overlay): - def __init__( - self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None - ): + def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None): super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) - self.sendchannel = self.hier_0.axi_dma_0.sendchannel - self.recvchannel = self.hier_0.axi_dma_0.recvchannel - self.input_buffer = allocate(shape=x_shape, dtype=dtype) - self.output_buffer = allocate(shape=y_shape, dtype=dtype) - + def _print_dt(self, timea, timeb, N): - dt = timeb - timea + dt = (timeb - timea) dts = dt.seconds + dt.microseconds * 10**-6 rate = N / dts - print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") + print("Classified {} samples in {} seconds ({} 
inferences / s)".format(N, dts, rate)) return dts, rate - - def predict(self, X, debug=False, profile=False, encode=None, decode=None): + + def reset_PL(): + PL.reset() + + def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None): """ Obtain the predictions of the NN implemented in the FPGA. Parameters: - X : the input vector. Should be numpy ndarray. - - dtype : the data type of the elements of the input/output vectors. - Note: it should be set depending on the interface of the accelerator; if it uses 'float' - types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and + for sizing the output vector shape. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot - any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` doc for more info). - In this case the encoding/decoding has to be computed by the PS. For example for - 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode 'float' -> 'ap_fixed<16,6>': ``` def encode(xi): @@ -48,28 +49,38 @@ def decode(yi): - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to the namesake parameter. 
""" - if profile: - timea = datetime.now() + if encode is not None: X = encode(X) - self.input_buffer[:] = X - self.sendchannel.transfer(self.input_buffer) - self.recvchannel.transfer(self.output_buffer) - if debug: - print("Transfer OK") - self.sendchannel.wait() - if debug: - print("Send OK") - self.recvchannel.wait() - if debug: - print("Receive OK") - # result = self.output_buffer.copy() + with allocate(shape=X.shape, dtype=dtype) as input_buffer, \ + allocate(shape=y_shape, dtype=dtype) as output_buffer: + input_buffer[:] = X + + if profile: + timea = datetime.now() + + self.axi_dma_0.sendchannel.transfer(input_buffer) + self.axi_dma_0.recvchannel.transfer(output_buffer) + if debug: + print("Transfer OK") + self.axi_dma_0.sendchannel.wait() + if debug: + print("Send OK") + self.axi_dma_0.recvchannel.wait() + + if profile: + timeb = datetime.now() + + if debug: + print("Receive OK") + + result = output_buffer.copy() + if decode is not None: - self.output_buffer = decode(self.output_buffer) - + result = decode(result) + if profile: - timeb = datetime.now() dts, rate = self._print_dt(timea, timeb, len(X)) - return self.output_buffer, dts, rate - else: - return self.output_buffer + return result, dts, rate + + return result \ No newline at end of file From c52ec75ade7c59166c163bbc7a7dcce6eaa67601 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 14:06:41 +0100 Subject: [PATCH 085/103] Run pre-commit --- hls4ml/backends/__init__.py | 10 ++- .../vitis_accelerator_ip_flow_backend.py | 15 ++--- .../vitis_accelerator_ip_flow_config.py | 9 ++- .../myproject_axi.cpp | 2 +- .../vitis_accelerator_ip_flow/myproject_axi.h | 2 +- .../python_drivers/axi_stream_driver.py | 47 +++++++------- .../vivado/nnet_utils/nnet_helpers.h | 46 +++++++------- hls4ml/writer/__init__.py | 2 +- .../vitis_accelerator_ip_flow_writer.py | 62 +++++++------------ 9 files changed, 88 insertions(+), 107 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py 
index 1f60bdb449..68562d75ed 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -2,16 +2,20 @@ from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend - +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( # noqa: F401 + VitisAcceleratorIPFlowConfig, +) from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( # isort: skip + VitisAcceleratorIPFlowBackend, +) + from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import VitisAcceleratorIPFlowBackend # isort: skip -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import VitisAcceleratorIPFlowConfig # noqa: F401 register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 6ade53b39d..cd57df5a4a 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -37,22 +37,19 @@ def build( # fifo_opt=fifo_opt, ) # Get Config to view Board and Platform - from hls4ml.backends import VitisAcceleratorIPFlowConfig + # from hls4ml.backends import 
VitisAcceleratorIPFlowConfig - vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( - model.config, model.get_input_variables(), model.get_output_variables() - ) + # vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( + # model.config, model.get_input_variables(), model.get_output_variables() + # ) # now make a bitfile if bitfile: - # if vitis_accelerator_config.get_board().startswith('alveo'): - # self.make_xclbin(model, vitis_accelerator_config.get_platform()) - # else: curr_dir = os.getcwd() os.chdir(model.config.get_output_dir()) try: - os.system('vivado -mode batch -source design.tcl') # check if this is accepted as a command + os.system('vivado -mode batch -source design.tcl') # check if this is accepted as a command except Exception: - print("Something went wrong, check the Vivado logs") + print("Something went wrong, check the Vivado logs") os.chdir(curr_dir) return parse_vivado_report(model.config.get_output_dir()) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py index d00e54a284..07961a9b6f 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py @@ -133,7 +133,14 @@ def get_driver_path(self): if self.board.startswith('alveo'): return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + self.driver + '_drivers/' + self.get_driver_file() else: - return '../templates/vitis_accelerator_ip_flow/' + self.board + '/' + self.driver + '_drivers/' + self.get_driver_file() + return ( + '../templates/vitis_accelerator_ip_flow/' + + self.board + + '/' + + self.driver + + '_drivers/' + + self.get_driver_file() + ) def get_driver_file(self): driver_ext = '.py' if self.driver == 'python' else '.h' diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp 
b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp index 01238643ed..cf6c0b9c25 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp @@ -1,6 +1,6 @@ // hls-fpga-machine-learning insert include -void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out) { +void myproject_axi(hls::stream &in, hls::stream &out) { // hls-fpga-machine-learning insert interface diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h index d49f98ba14..d0d88bfecf 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h @@ -6,5 +6,5 @@ // hls-fpga-machine-learning insert definitions -void myproject_axi(hls::stream< my_pkt > &in, hls::stream< my_pkt > &out); +void myproject_axi(hls::stream &in, hls::stream &out); #endif diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py index fda308e9ca..1d70e55406 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py @@ -1,25 +1,23 @@ -from pynq import DefaultHierarchy, DefaultIP, allocate -from pynq import Overlay from datetime import datetime -import pynq.lib.dma + import numpy as np -from pynq import PL +from pynq import PL, Overlay, allocate class NeuralNetworkOverlay(Overlay): def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None): super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) - + def _print_dt(self, timea, timeb, N): - dt = (timeb - timea) + dt = timeb - timea dts = dt.seconds + dt.microseconds * 10**-6 rate = N / dts - 
print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate)) + print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") return dts, rate - + def reset_PL(): PL.reset() - + def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None): """ Obtain the predictions of the NN implemented in the FPGA. @@ -27,14 +25,14 @@ def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encod - X : the input vector. Should be numpy ndarray. - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and for sizing the output vector shape. - - dtype : the data type of the elements of the input/output vectors. - Note: it should be set depending on the interface of the accelerator; if it uses 'float' - types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot - any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` doc for more info). - In this case the encoding/decoding has to be computed by the PS. For example for - 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + In this case the encoding/decoding has to be computed by the PS. 
For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode 'float' -> 'ap_fixed<16,6>': ``` def encode(xi): @@ -52,13 +50,12 @@ def decode(yi): if encode is not None: X = encode(X) - with allocate(shape=X.shape, dtype=dtype) as input_buffer, \ - allocate(shape=y_shape, dtype=dtype) as output_buffer: + with allocate(shape=X.shape, dtype=dtype) as input_buffer, allocate(shape=y_shape, dtype=dtype) as output_buffer: input_buffer[:] = X - + if profile: timea = datetime.now() - + self.axi_dma_0.sendchannel.transfer(input_buffer) self.axi_dma_0.recvchannel.transfer(output_buffer) if debug: @@ -67,20 +64,20 @@ def decode(yi): if debug: print("Send OK") self.axi_dma_0.recvchannel.wait() - + if profile: timeb = datetime.now() - + if debug: print("Receive OK") - + result = output_buffer.copy() - + if decode is not None: result = decode(result) - + if profile: dts, rate = self._print_dt(timea, timeb, len(X)) return result, dts, rate - return result \ No newline at end of file + return result diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 389d687089..2a695d4e5a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -167,8 +167,9 @@ template void convert_data(hls::stre } #ifdef VITIS_ACCELERATOR_IP_FLOW -//todo avoid hardcoding hls::axis and use template -template void convert_data(srcType *src, hls::stream> &dst) { +// todo avoid hardcoding hls::axis and use template +template +void convert_data(srcType *src, hls::stream> &dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype; ctype.data = dstType(src[i]); @@ -176,7 +177,8 @@ template void convert_data(srcTyp } } -template void convert_data(hls::stream> &src, dstType *dst) { +template +void convert_data(hls::stream> &src, dstType *dst) { for (size_t i = 0; i < SIZE; i++) { hls::axis ctype = src.read(); dst[i] = dstType(ctype.data); 
@@ -323,8 +325,7 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template ::value, int>::type = 0> +template ::value, int>::type = 0> void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / res_T::size; i++) { res_T res_pack = result.read(); @@ -333,29 +334,29 @@ void print_result(hls::stream &result, std::ostream &out, bool keep = fal } if (keep) { result.write(res_pack); - } + } } out << std::endl; } // compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel -template ::value, int>::type = 0> +template ::value, int>::type = 0> void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE; i++) { res_T res_pack = result.read(); - + out << res_pack.data << " "; - + if (keep) { result.write(res_pack); - } + } } out << std::endl; } // compatible with Vitis Accelerator for res_T = hls::axis and io_stream -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +template +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / underlying_res_T::size; i++) { res_T res_pack; for (int j = 0; j < underlying_res_T::size; j++) { @@ -363,16 +364,15 @@ template void print_result(hl out << res_pack.data << " "; if (keep) { result.write(res_pack); - } - } + } + } } out << std::endl; } template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } -template ::value, int>::type = 0> +template ::value, int>::type = 0> void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / data_T::size; i++) { data_T data_pack; @@ -383,16 +383,14 @@ void fill_zero(hls::stream &data) { } } -template ::value, int>::type = 0> +template ::value, int>::type = 0> void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE; i++) { data_T data_pack; data_pack.data = 0.; - if (i==SIZE-1) { + if (i == SIZE - 1) { data_pack.last = 1; - } - else { + } else { 
data_pack.last = 0; } data.write(data_pack); @@ -405,15 +403,13 @@ template void fill_zero(hls data_T data_pack; for (int j = 0; j < underlying_data_T::size; j++) { data_pack.data = 0.; - if ((i==(SIZE / underlying_data_T::size-1)) && (j==(underlying_data_T::size-1))) { + if ((i == (SIZE / underlying_data_T::size - 1)) && (j == (underlying_data_T::size - 1))) { data_pack.last = 1; - } - else { + } else { data_pack.last = 0; } data.write(data_pack); } - } } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 70a2eabd75..31238b18c8 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,8 +1,8 @@ from hls4ml.writer.catapult_writer import CatapultWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter -from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter +from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.writers import Writer, get_writer, register_writer # noqa: F401 diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 535a43b4bc..78e1fa982d 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -40,8 +40,10 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += f'typedef hls::axis my_pkt;\n' - else: # TODO: handle this case + newline += 'typedef hls::axis my_pkt;\n' + # might need to make "float" a variable according to the + # configuration set by the user and the DMA available data widths + else: # 
TODO: handle this case newline += f'typedef {inp_axi_t} input_axi_t;\n' newline += f'typedef {out_axi_t} output_axi_t;\n' else: @@ -68,9 +70,9 @@ def write_axi_wrapper(self, model): newline = '' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': newline += indent + 'bool is_last = false;\n' - if io_type == 'io_parallel': # TODO: handle io_parallel + if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' - newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' newline += indent + 'my_pkt tmp;\n' elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' @@ -84,12 +86,12 @@ def write_axi_wrapper(self, model): elif '// hls-fpga-machine-learning insert call' in line: newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' elif '// hls-fpga-machine-learning insert interface' in line: - if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite newline = '' newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' - elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master newline = '' newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( @@ -107,7 +109,7 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS DATAFLOW\n' elif '// hls-fpga-machine-learning insert enqueue' in line: io_type = 
model.config.get_config_value("IOType") - if io_type == 'io_parallel': # TODO: handle io_parallel + if io_type == 'io_parallel': # TODO: handle io_parallel newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': @@ -132,22 +134,10 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += ( - indent - + indent - + indent - + 'in.read(tmp);\n' - ) - newline += ( - indent - + indent - + indent - + 'ctype[j] = tmp.data;\n' - ) - newline += ( - indent + indent + indent + 'is_last = tmp.last;\n' - ) - else: # TODO: handle this case + newline += indent + indent + indent + 'in.read(tmp);\n' + newline += indent + indent + indent + 'ctype[j] = tmp.data;\n' + newline += indent + indent + indent + 'is_last = tmp.last;\n' + else: # TODO: handle this case newline += ( indent + indent @@ -181,17 +171,11 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' # newline += indent + indent + indent + '#pragma HLS UNROLL\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += ( - indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' - ) + newline += indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' - newline += ( - indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' - ) + newline += indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' - newline += ( - indent + indent + indent + 'out.write(tmp);\n' - ) + newline += indent + indent + indent + 'out.write(tmp);\n' else: newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' newline += 
indent + indent + '}}\n' @@ -253,7 +237,7 @@ def write_wrapper_test(self, model): ################### oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp' newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp' - + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() f = open(oldfile) @@ -278,19 +262,15 @@ def write_wrapper_test(self, model): indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = ( - line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') - ) + newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = ( - line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') - ) + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': if 'copy_data' in line: newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") - + if io_type == 'io_stream': if 'nnet::fill_zero' in line: newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") @@ -331,7 +311,7 @@ def write_wrapper_test(self, model): elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, inp_axi_t) elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, out_axi_t) + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, 
out_axi_t) else: newline = line fout.write(newline) From 9d9e6454c195505068504ae0f6d84d9c39d418c1 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 14:30:02 +0100 Subject: [PATCH 086/103] Remove unused file --- .../passes/fifo_depth_optimization.py | 69 ------------------- 1 file changed, 69 deletions(-) delete mode 100644 hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py deleted file mode 100644 index e983ca49fb..0000000000 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ /dev/null @@ -1,69 +0,0 @@ -# from hls4ml.backends.vivado.passes.fifo_depth_optimization import ( -# generate_max_depth_file, -# get_vcd_data, -# populate_values, -# set_big_fifos, -# set_fifo_depth, -# ) -# from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass - - -# class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): -# def __init__(self): -# self.values = [] - -# def transform(self, model): -# # use `large_fifo_depth = 0` to keep the default fifo depth -# profiling_fifo_depth = getattr(self, 'profiling_fifo_depth', 100_000) - -# # check axi-stream or io-stream, if not one the 2 exit -# if not ( -# model.config.get_config_value('IOType') == 'io_stream' -# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_stream' -# or model.config.get_config_value('AcceleratorConfig')['Interface'] == 'axi_master' -# ): -# raise Exception( -# 'To use this optimization you have to set `IOType` field to `io_stream` in the HLS config ' -# 'or `axi_stream` or `axi_master` in `AcceleratorConfig` interface field' -# ) - -# # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be -# # profiled - -# if profiling_fifo_depth: -# 
set_big_fifos(model.output_vars, profiling_fifo_depth) - -# data = get_vcd_data(model) - -# for i in range(1, len(data['children'][0]['children'][0]['children'])): -# # wrapper fifos -# populate_values( -# self.values, -# data['children'][0]['children'][0]['children'][i]['name'], -# data['children'][0]['children'][0]['children'][i]['children'][0]['data'], -# data['children'][0]['children'][0]['children'][i]['children'][1]['data'], -# ) - -# n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) -# for i in range(n_elem): -# name = data['children'][0]['children'][0]['children'][0]['children'][i]['name'] -# data_p = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'] -# depth = data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'] -# populate_values(self.values, name, data_p, depth) - -# maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in self.values] - -# generate_max_depth_file(model, maxs) - -# set_fifo_depth(model, maxs) - -# inp = model.get_input_variables()[0] -# out = model.get_output_variables()[0] -# for x in maxs: -# if 'in_local' in x['name']: -# inp.pragma = (inp.pragma[0], x['max'] + 1) -# elif 'out_local' in x['name']: -# out.pragma = (out.pragma[0], x['max'] + 1) - -# print('[hls4ml] - FIFO optimization completed') -# return False From 80697c0a8931d206eec5cf0b2f6bb3f918c99cee Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 20 Nov 2024 14:33:05 +0100 Subject: [PATCH 087/103] Remove unused xclbin generator --- .../vitis_accelerator_ip_flow_backend.py | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index cd57df5a4a..0372a75b75 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ 
b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -54,44 +54,6 @@ def build( return parse_vivado_report(model.config.get_output_dir()) - # def make_xclbin(self, model, platform='xilinx_u250_xdma_201830_2'): - # """Create the xclbin for the given model and target platform. - - # Args: - # model (ModelGraph): Compiled and build model. - # platform (str, optional): Development/Deployment target platform, must be installed first. - # The host machine only requires the deployment target platform. Refer to the Getting Started section of - # the Alveo guide. Defaults to 'xilinx_u250_xdma_201830_2'. - # """ - # curr_dir = os.getcwd() - # abs_path_dir = os.path.abspath(model.config.get_output_dir()) - # os.chdir(abs_path_dir) - # os.makedirs('xo_files', exist_ok=True) - # try: - # os.system('vivado -mode batch -source design.tcl') - # except Exception: - # print("Something went wrong, check the Vivado logs") - # project_name = model.config.get_project_name() - # ip_repo_path = abs_path_dir + '/' + project_name + '_prj' + '/solution1/impl/ip' - # os.makedirs('xclbin_files', exist_ok=True) - # os.chdir(abs_path_dir + '/xclbin_files') - # # TODO Add other platforms - # vitis_cmd = ( - # "v++ -t hw --platform " - # + platform - # + " --link ../xo_files/" - # + project_name - # + "_kernel.xo -o'" - # + project_name - # + "_kernel.xclbin' --user_ip_repo_paths " - # + ip_repo_path - # ) - # try: - # os.system(vitis_cmd) - # except Exception: - # print("Something went wrong, check the Vitis/Vivado logs") - # os.chdir(curr_dir) - def create_initial_config( self, board='pynq-z2', From f46782919774f7d5298b0e7376e0eba2c87111b2 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 27 Nov 2024 09:58:00 +0100 Subject: [PATCH 088/103] Clean backends init --- hls4ml/backends/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 68562d75ed..ca3fff0e77 100644 --- 
a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -2,19 +2,16 @@ from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( # noqa: F401 - VitisAcceleratorIPFlowConfig, -) from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 - -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( # isort: skip +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( VitisAcceleratorIPFlowBackend, ) - +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( + VitisAcceleratorIPFlowConfig, +) from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip - from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip register_backend('Vivado', VivadoBackend) From 4c7455092a72d83ffb5f4f1234530e5c70e1453b Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 27 Nov 2024 10:20:31 +0100 Subject: [PATCH 089/103] Fix backend import sequence --- hls4ml/backends/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index ca3fff0e77..7ba2ad4fbb 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,14 +5,14 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: 
F401 +from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip +from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( VitisAcceleratorIPFlowBackend, ) from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( VitisAcceleratorIPFlowConfig, ) -from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip -from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) From 542b9508b20356f561c4ea29ebb3e58f6b5ab716 Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 19 Feb 2025 14:39:19 +0100 Subject: [PATCH 090/103] Start cleaning up code --- .../supported_boards.json | 28 ------------------- .../vitis_accelerator_ip_flow_backend.py | 9 +----- .../vitis/nnet_utils/nnet_sepconv1d_stream.h | 2 +- .../vitis/nnet_utils/nnet_sepconv2d_stream.h | 2 +- .../myproject_axi.cpp | 2 +- .../vitis_accelerator_ip_flow/myproject_axi.h | 2 +- .../vivado/nnet_utils/nnet_sepconv1d_stream.h | 2 +- .../vivado/nnet_utils/nnet_sepconv2d_stream.h | 2 +- .../vitis_accelerator_ip_flow_writer.py | 18 ++++++------ 9 files changed, 16 insertions(+), 51 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json index 1279ec22d0..4a54ea2924 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json +++ b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json @@ -10,33 +10,5 @@ "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} - }, - "alveo-u50": { - "part": "xcu50-fsvh2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - 
"krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u250": { - "part": "xcu250-figd2104-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u200": { - "part": "xcu200-fsgd2104-2-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - }, - "alveo-u280": { - "part": "xcu280-fsvh2892-2L-e", - "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} } } diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 0372a75b75..872f6383e4 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -21,7 +21,7 @@ def build( validation=False, export=False, vsynth=False, - # fifo_opt=False, + fifo_opt=False, bitfile=False, ): # run the VitisBackend build @@ -36,12 +36,7 @@ def build( vsynth=vsynth, # fifo_opt=fifo_opt, ) - # Get Config to view Board and Platform - # from hls4ml.backends import VitisAcceleratorIPFlowConfig - # vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig( - # model.config, model.get_input_variables(), model.get_output_variables() - # ) # now make a bitfile if bitfile: curr_dir = os.getcwd() @@ -100,8 +95,6 @@ def create_initial_config( config['AcceleratorConfig']['Precision']['Output'] = {} config['AcceleratorConfig']['Precision']['Input'] = input_type # float, double or ap_fixed config['AcceleratorConfig']['Precision']['Output'] = output_type # float, double or 
ap_fixed - # if board.startswith('alveo'): - # config['AcceleratorConfig']['Platform'] = platform return config diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h index aad5d9a430..20b6fecb49 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h @@ -86,7 +86,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h index a119fb9e2a..a3747990e0 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h @@ -120,7 +120,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_2d_buffer_cl(data, depthwise_res, diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp index cf6c0b9c25..1655ce506b 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp @@ -1,6 +1,6 @@ // hls-fpga-machine-learning insert include -void myproject_axi(hls::stream &in, hls::stream &out) { +void myproject_axi(hls::stream &in, hls::stream &out) { 
// hls-fpga-machine-learning insert interface diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h index d0d88bfecf..1c019b5f10 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h +++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h @@ -6,5 +6,5 @@ // hls-fpga-machine-learning insert definitions -void myproject_axi(hls::stream &in, hls::stream &out); +void myproject_axi(hls::stream &in, hls::stream &out); #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h index 11622efbf0..ca3143d01e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h @@ -109,7 +109,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights, diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h index f5cafd2ee7..7f4dd866c9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h @@ -133,7 +133,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res, #pragma HLS DATAFLOW hls::stream depthwise_res; - const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; + unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width; #pragma HLS STREAM variable=depthwise_res depth=res_depth depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights, diff --git 
a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 78e1fa982d..4f96e38f33 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -40,7 +40,7 @@ def write_axi_wrapper(self, model): newline += f'static const unsigned N_IN = {inp.size()};\n' newline += f'static const unsigned N_OUT = {out.size()};\n' if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': - newline += 'typedef hls::axis my_pkt;\n' + newline += 'typedef hls::axis dma_data_packet;\n' # might need to make "float" a variable according to the # configuration set by the user and the DMA available data widths else: # TODO: handle this case @@ -73,7 +73,7 @@ def write_axi_wrapper(self, model): if io_type == 'io_parallel': # TODO: handle io_parallel newline += indent + inp.type.name + ' in_local[N_IN];\n' newline += indent + out.type.name + ' out_local[N_OUT];\n' - newline += indent + 'my_pkt tmp;\n' + newline += indent + 'dma_data_packet tmp;\n' elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' @@ -124,7 +124,7 @@ def write_axi_wrapper(self, model): newline += indent + 'tmp.last = 0;\n' elif io_type == 'io_stream': newline = '' - newline += indent + 'my_pkt tmp;\n' + newline += indent + 'dma_data_packet tmp;\n' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed @@ -252,19 +252,19 @@ def write_wrapper_test(self, model): newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp() in line: newline = line.replace( - inp.definition_cpp(), 'hls::stream< my_pkt > inputs' + inp.definition_cpp(), 'hls::stream< dma_data_packet > inputs' ) # TODO instead of 
replacing strings, how about we use proper variables and their definition? elif out.definition_cpp() in line: - newline = line.replace(out.definition_cpp(), 'hls::stream< my_pkt > outputs') + newline = line.replace(out.definition_cpp(), 'hls::stream< dma_data_packet > outputs') elif 'unsigned short' in line: newline = '' elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt') + newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'dma_data_packet') elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt') + newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'dma_data_packet') else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': @@ -300,9 +300,9 @@ def write_wrapper_test(self, model): if f'{model.config.get_project_name()}.h' in line: newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {inp.name}_ap') + newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap') elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {out.name}_ap') + newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap') elif 
f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( From c78aec2d7c17346e7ff1806e6c4b994748c5cfdb Mon Sep 17 00:00:00 2001 From: steltze Date: Wed, 19 Feb 2025 14:59:22 +0100 Subject: [PATCH 091/103] Start integrating FIFO depth optimizer --- .../passes/fifo_depth_optimization.py | 247 ++++++++++++++++++ .../vitis_accelerator_ip_flow_backend.py | 11 +- hls4ml/templates/vivado/build_prj.tcl | 4 + .../vitis_accelerator_ip_flow_writer.py | 6 +- .../test_optimization/test_fifo_depth.py | 195 ++++++++++++++ 5 files changed, 453 insertions(+), 10 deletions(-) create mode 100644 hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py create mode 100644 test/pytest/test_optimization/test_fifo_depth.py diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..de7b61075e --- /dev/null +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -0,0 +1,247 @@ +import json +import os + +from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +def initialize_large_fifos(model, profiling_fifo_depth): + """Set all FIFO depths equal to a large value so that they can be profiled. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + profiling_fifo_depth (int): A large non-negative integer, must be larger than the max expected depth of the FIFOs. + + Returns: + Dict[str, int]: A dictionary containing FIFO names as keys and their initial depths as values is returned for + comparison with the optimized depths. 
+ """ + + # filter all the output variables and keep only the internal FIFOs, excluding output objects that are not FIFOs and the + # input and output FIFOs as they can't be profiled and are implementation dependant i.e AXI Stream, AXI Master or + # connected to another IP + vars_to_profile = { + output_variable_name: output_variable + for output_variable_name, output_variable in model.output_vars.items() + if ("VivadoStreamVariable" in str(type(output_variable))) + and output_variable != model.get_output_variables()[0] + and output_variable != model.get_input_variables()[0] + } + + # initialize all the fifos to `profiling_fifo_depth` so that they will be automatically implemented in BRAMs and so + # they will be profiled. Alternatively, "config_dataflow -override_user_fifo_depth profiling_fifo_depth" can be + # used inside build_prj.tcl to override all FIFO depths with the specified value + initial_fifo_depths = {} + for output_variable in vars_to_profile.values(): + if output_variable.pragma: + initial_fifo_depths[output_variable.name] = int(output_variable.pragma[1]) + output_variable.pragma = (output_variable.pragma[0], profiling_fifo_depth) + return initial_fifo_depths + + +def override_test_bench(model): + """In order for the FIFO depth profiling to produce correct results, it is necessary for the cosimulation to + call the top function - Vitis IP at **least twice**. The test bench produced by the Vivado Writer is + overwritten by adding a for-loop over the top function. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. 
+ """ + indent = " " + path_to_old_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp" + path_to_new_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_new_test.cpp" + + newline = "" + second_part_of_testbench = False + with open(path_to_old_test_bench) as old_test_bench: + file_iterator = iter(old_test_bench) + for line in file_iterator: + + if "// hls-fpga-machine-learning insert zero" in line: + newline += indent + indent + "const unsigned PROFILING_ITERATIONS = 2;\n" + newline += ( + indent + + indent + + "for(unsigned batch_iteration = 0; batch_iteration < PROFILING_ITERATIONS; ++batch_iteration) {\n" + ) + newline += line + second_part_of_testbench = True + elif ("// hls-fpga-machine-learning insert tb-output" in line) and second_part_of_testbench: + newline += line + newline += next(file_iterator) + newline += indent + "}\n" + else: + newline += line + + with open(path_to_new_test_bench, "w+") as new_test_bench: + new_test_bench.write(newline) + + # replace the old test bench with the new test bench that includes a for-loop + os.replace(path_to_new_test_bench, path_to_old_test_bench) + return + + +def execute_cosim_to_profile_fifos(model): + """Execute a cosimulation with a testh bench that calls the top function - Vitis IP at **least twice**, + to properly profile the max FIFO depths. The function will momentarily replace the initial test bench + with a suitable one for the optimization, and after the optimizer pass, the original test bench reinitialized. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. 
+ """ + model.write() + + override_test_bench(model) + + model.build( + reset=False, + csim=False, + synth=True, + cosim=True, + validation=False, + export=False, + vsynth=False, + fifo_opt=True, + ) + + return + + +def get_vitis_optimized_fifo_depths(model): + """Parse the files generated by the cosimulation to retrieve the optimized depths for the FIFOs. + Attention, only the FIFOs between the layers are profiled! + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + + Returns: + Dict[str, int]: A dictionary that contains the FIFO names as keys and the optimized depths as values. + """ + # channel.zip is generated after the cosimulation and contains the chan_status*.csv files + # in the chan_status*.csv files the max depth achieved during cosimulation can be found at the last (4th) line + path_to_zip_file = ( + model.config.get_output_dir() + + "/" + + model.config.get_project_name() + + "_prj" + + "/solution1/.autopilot/db/channel_depth_info/" + ) + + os.system(f"unzip -q -o {path_to_zip_file}channel.zip -d {path_to_zip_file}") + + # the channel_info.csv file contains the mapping of each fifo name (i.e layer4_out_U) to the respective + # chan_status*.csv file + names_file_path = ( + model.config.get_output_dir() + + "/" + + model.config.get_project_name() + + "_prj" + + "/solution1/.autopilot/db/channel_info.csv" + ) + + csv_fifo_depth_files = {} + with open(names_file_path) as names_file: + for line in names_file: + layer_name = line.split(",")[1] + csv_file_name = line.split(",")[3][:-1] + csv_fifo_depth_files[layer_name] = csv_file_name + + optmized_fifo_depths = {} + for layer_name, file_name in csv_fifo_depth_files.items(): + with open(path_to_zip_file + file_name) as chan_status_file: + lines = chan_status_file.readlines() + optmized_fifo_depths[layer_name[:-2]] = int( + lines[-1] + ) # remove "_U" from the layer name string and keep the last line of the file that contains the max depth + + return optmized_fifo_depths + + 
+def generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths): + """Generate a json file with the names of the FIFOs, the initial depths set by hls4ml and their optimized depths, + for post-processing. The json file is not used by the rest of the pipeline, it is only produced for the user. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + initial_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the initial + depths as values. + optmized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized + depths as values. + """ + depths = {} + for fifo_name in initial_fifo_depths.keys(): + depths[fifo_name] = {} + depths[fifo_name]['initial'] = initial_fifo_depths[fifo_name] + depths[fifo_name]['optimized'] = optimized_fifo_depths[fifo_name] + + with open(model.config.get_output_dir() + "/fifo_depths.json", "w") as f: + json.dump(depths, f, indent=4) + + +def set_optimized_fifo_depths(model, optimized_fifo_depths): + """Set the new optimized FIFO depths. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + optmized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized + depths as values. 
+ """ + + # iterate through the layer output FIFOs + for output_variable in model.output_vars.values(): + if "VivadoStreamVariable" in str(type(output_variable)): + if output_variable.pragma: + + if output_variable.name not in optimized_fifo_depths.keys(): + continue + + filtered_depth = optimized_fifo_depths[output_variable.name] + output_variable.pragma = (output_variable.pragma[0], filtered_depth) + return + + +class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): + def __init__(self): + pass + + def transform(self, model): + """Perform FIFO depth optimization between the FIFOs of all layers to reduce resource utilization as the + initial FIFOs set by hls4ml might be larger than required. At the end of the optimization the FIFOs will + have the largest depths achieved during cosimulation without causing any deadlocks between the layers + (producer-consumer), thus no additional delays between the layers. In some cases, this optimization + might lead to bigger FIFOs than initially set by the hls4ml tool in order to prevent deadlocks. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + + Raises: + ValueError: If the FIFO depth for profiling provided by the user is not a non-negative integer. + RuntimeError: If the IO type is not set to "io_stream". 
+ + Returns: + bool: The execution state of the Optimzer Pass + """ + + # use `large_fifo_depth = 0` to keep the default fifo depth + # consider changing 100_000 either with a very very large value > of any total bram storage space + # or via vitis 2023.2 c-simulation + profiling_fifo_depth = getattr(self, "profiling_fifo_depth", 100_000) + + if not isinstance(profiling_fifo_depth, int) or profiling_fifo_depth <= 0: + raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer.") + + # check axi-stream or io-stream + if not (model.config.get_config_value("IOType") == "io_stream"): + raise RuntimeError("To use this optimization you have to set `IOType` field to `io_stream` in the HLS config.") + + initial_fifo_depths = initialize_large_fifos(model, profiling_fifo_depth) + + execute_cosim_to_profile_fifos(model) + + optimized_fifo_depths = get_vitis_optimized_fifo_depths(model) + + generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths) + + set_optimized_fifo_depths(model, optimized_fifo_depths) + + print("[hls4ml] - FIFO optimization completed") + return False diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 872f6383e4..08eeef0032 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -34,7 +34,6 @@ def build( validation=validation, export=export, vsynth=vsynth, - # fifo_opt=fifo_opt, ) # now make a bitfile @@ -59,8 +58,7 @@ def create_initial_config( interface='axi_stream', driver='python', input_type='float', - output_type='float', - platform='xilinx_u250_xdma_201830_2', + output_type='float' ): ''' Create initial accelerator config with default parameters @@ -110,6 +108,9 @@ def _register_flows(self): self._writer_flow = register_flow('write', 
writer_passes, requires=[vivado_ip], backend=self.name) self._default_flow = vivado_ip - # fifo_depth_opt_passes = ['vivadoaccelerator:fifo_depth_optimization'] + writer_passes + # Register the fifo depth optimization flow which is different from the one for vivado + fifo_depth_opt_passes = [ + 'vitisacceleratoripflow:fifo_depth_optimization' + ] + writer_passes # After optimization, a new project will be written - # register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vivado_ip], backend=self.name) + register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitisacceleratoripflow:ip'], backend=self.name) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 50596091f2..9dbab5b9d6 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -196,6 +196,10 @@ if {$opt(cosim)} { if {$opt(fifo_opt)} { puts "\[hls4ml\] - FIFO optimization started" add_vcd_instructions_tcl + + if {[string equal "$backend" "vivado"] || [string equal $backend "vivadoaccelerator"]} { + add_vcd_instructions_tcl + } } remove_recursive_log_wave diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 4f96e38f33..977a6d6e04 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -329,11 +329,6 @@ def write_board_script(self, model): os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_tcl_file_path()), f'{model.config.get_output_dir()}/design.tcl', ) - # Generic alveo board - if self.vitis_accelerator_ip_flow_config.get_board().startswith('alveo'): - src_dir = os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_krnl_rtl_src_dir()) - dst_dir = os.path.abspath(model.config.get_output_dir()) + '/src' - copy_tree(src_dir, dst_dir) ################### # project.tcl @@ -356,6 +351,7 @@ def write_board_script(self, model): 
f.write(f'set bit_width_hls_output {in_bit}\n') f.write(f'set bit_width_hls_input {out_bit}\n') f.close() + return def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_optimization/test_fifo_depth.py new file mode 100644 index 0000000000..1e99a7adc9 --- /dev/null +++ b/test/pytest/test_optimization/test_fifo_depth.py @@ -0,0 +1,195 @@ +import json +import os +import re +from pathlib import Path + +import numpy as np +import pytest +import qonnx.core.onnx_exec as oxe +from qonnx.core.modelwrapper import ModelWrapper +from tensorflow.keras.layers import SeparableConv2D +from tensorflow.keras.models import Sequential + +import hls4ml +from hls4ml.backends.vitis.passes.fifo_depth_optimization import override_test_bench + +test_root_path = Path(__file__).parent +example_model_path = (test_root_path / '../../../example-models').resolve() + +backend_options = ['Vitis'] + + +def parse_cosim_report(project_path): + """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is + detected. 
+ """ + prj_dir = None + top_func_name = None + + project_tcl_path = project_path + '/project.tcl' + + with open(project_tcl_path) as f: + for line in f.readlines(): + if 'set project_name' in line: + top_func_name = line.split('"')[-2] + prj_dir = top_func_name + '_prj' + + cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_cosim.rpt' + + if os.path.isfile(cosim_file_path): + return cosim_file_path + else: + raise FileNotFoundError("Co-simulation report not found.") + + +def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): + """Execute the FIFO depth optimization sequence on a dummy Keras model.""" + + # create a keras model + input_shape = (128, 128, 3) + activation = 'relu' + kernel_size = (3, 3) + padding = 'same' + + model = Sequential() + model.add( + SeparableConv2D(filters=4, kernel_size=kernel_size, padding=padding, activation=activation, input_shape=input_shape) + ) + model.add(SeparableConv2D(filters=8, kernel_size=kernel_size, padding=padding, activation=activation)) + model.compile(optimizer='adam', loss='mse') + + X_input = np.random.rand(1, *input_shape) + keras_prediction = model.predict(X_input) + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') + + # include the FIFO Depth optimizer do the flows + config['Flows'] = ['vitis:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) + + output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_keras_backend_{backend}') + + # execute fifo optimization + hls_model = hls4ml.converters.convert_from_keras_model( + model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend + ) + + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) + + # 
check that the FIFOs have been optimized succesfully + fifo_depth_optimization_checks(hls_model) + + +def fifo_depth_optimization_checks(hls_model): + """Execute the FIFO depth optimization sequence on an hls4ml model.""" + + # force the top-function to execute twice in the cosimulation, to verify no deadlocks occur even + # when streaming multiple inputs into the network + override_test_bench(hls_model) + + # build the new project with optimized depths and execute cosimulation to check for deadlocks + # due to the new FIFO depths + hls_model.build(reset=False, csim=False, synth=True, cosim=True) + + # checks if the fifo depths decreased/were optimized + fifo_depths = {} + with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: + fifo_depths = json.load(fifo_depths_file) + + fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) + + # checks that the cosimulation ran succesfully without detecting deadlocks + cosim_report_path = parse_cosim_report(hls_model.config.get_output_dir()) + + with open(cosim_report_path) as cosim_report_file: + cosim_succesful = any("Pass" in line for line in cosim_report_file) + + assert fifo_depths_decreased and cosim_succesful + + +def expect_exception(error, message, backend, profiling_fifo_depth, io_type): + with pytest.raises(error, match=re.escape(message)): + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"]) +def test_value_error(backend, profiling_fifo_depth): + """Test the FIFO depth optimizer with faulty inputs of profiling_fifo_depth to verify that an exception is raised.""" + message = "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer." 
+ expect_exception(ValueError, message, backend, profiling_fifo_depth, io_type='io_stream') + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_runtime_error(backend): + """Test the FIFO depth optimizer with io_type='io_parallel' to verify that an exception is raised.""" + message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config." + expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') + + +def get_branched_model(): + """ + Load branched model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") + assert os.path.isfile(dl_file) + model = ModelWrapper(dl_file) + return model + + +def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): + """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" + + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' + ) + + # add this line to remove the linear layer that quantizes the input of the NN + config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' + + config['Flows'] = 
['vitis:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) + + output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=output_dir, + io_type=io_type, + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + + fifo_depth_optimization_checks(hls_model) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_tiny_unet(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) From 62b5c277c328a04eea577925795b554662d1180d Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 20 Feb 2025 15:00:01 +0100 Subject: [PATCH 092/103] Fix FIFO depth optimizer --- hls4ml/backends/vitis/vitis_backend.py | 24 ++++- .../passes/fifo_depth_optimization.py | 18 +++- .../vitis_accelerator_ip_flow_backend.py | 9 +- .../test_optimization/test_fifo_depth.py | 102 +++++++++--------- 4 files changed, 94 insertions(+), 59 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 89484237f3..ff2104c795 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -75,7 +75,18 @@ def create_initial_config( return config - def build(self, model, reset=False, csim=True, synth=True, cosim=False, validation=False, export=False, vsynth=False): + def build( + self, + model, + reset=False, + csim=True, + synth=True, + cosim=False, + validation=False, + export=False, + vsynth=False, + fifo_opt=False, + ): if 'linux' in 
sys.platform: found = os.system('command -v vitis_hls > /dev/null') if found != 0: @@ -87,7 +98,16 @@ def build(self, model, reset=False, csim=True, synth=True, cosim=False, validati ( 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' 'validation={validation} export={export} vsynth={vsynth}"' - ).format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth) + ).format( + reset=reset, + csim=csim, + synth=synth, + cosim=cosim, + validation=validation, + export=export, + vsynth=vsynth, + fifo_opt=fifo_opt, + ) ) os.chdir(curr_dir) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py index de7b61075e..38706047a7 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -35,6 +35,14 @@ def initialize_large_fifos(model, profiling_fifo_depth): if output_variable.pragma: initial_fifo_depths[output_variable.name] = int(output_variable.pragma[1]) output_variable.pragma = (output_variable.pragma[0], profiling_fifo_depth) + + inp = model.get_input_variables()[0] + initial_fifo_depths['in_local'] = int(inp.pragma[1]) + inp.pragma = (inp.pragma[0], profiling_fifo_depth) + + outp = model.get_output_variables()[0] + initial_fifo_depths['out_local'] = int(outp.pragma[1]) + outp.pragma = (outp.pragma[0], profiling_fifo_depth) return initial_fifo_depths @@ -188,7 +196,7 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): # iterate through the layer output FIFOs for output_variable in model.output_vars.values(): - if "VivadoStreamVariable" in str(type(output_variable)): + if ("VivadoStreamVariable" in str(type(output_variable))) or (output_variable.name == 'in_local') or (output_variable.name == 'out_local'): if output_variable.pragma: if output_variable.name not 
in optimized_fifo_depths.keys(): @@ -196,6 +204,12 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): filtered_depth = optimized_fifo_depths[output_variable.name] output_variable.pragma = (output_variable.pragma[0], filtered_depth) + + inp = model.get_input_variables()[0] + inp.pragma = (inp.pragma[0], optimized_fifo_depths['in_local']) + + outp = model.get_output_variables()[0] + outp.pragma = (inp.pragma[0], optimized_fifo_depths['out_local']) return @@ -227,7 +241,7 @@ def transform(self, model): profiling_fifo_depth = getattr(self, "profiling_fifo_depth", 100_000) if not isinstance(profiling_fifo_depth, int) or profiling_fifo_depth <= 0: - raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer.") + raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a positive integer.") # check axi-stream or io-stream if not (model.config.get_config_value("IOType") == "io_stream"): diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 08eeef0032..c5dff6f789 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -34,6 +34,7 @@ def build( validation=validation, export=export, vsynth=vsynth, + fifo_opt=True, ) # now make a bitfile @@ -103,14 +104,14 @@ def get_writer_flow(self): return self._writer_flow def _register_flows(self): - vivado_ip = 'vivado:ip' + # vivado_ip = 'vivado:ip' writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] - self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name) - self._default_flow = vivado_ip + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) + # self._default_flow = vivado_ip # Register the fifo 
depth optimization flow which is different from the one for vivado fifo_depth_opt_passes = [ 'vitisacceleratoripflow:fifo_depth_optimization' ] + writer_passes # After optimization, a new project will be written - register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitisacceleratoripflow:ip'], backend=self.name) + register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitis:ip'], backend=self.name) diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_optimization/test_fifo_depth.py index 1e99a7adc9..8589acbe51 100644 --- a/test/pytest/test_optimization/test_fifo_depth.py +++ b/test/pytest/test_optimization/test_fifo_depth.py @@ -5,18 +5,18 @@ import numpy as np import pytest -import qonnx.core.onnx_exec as oxe -from qonnx.core.modelwrapper import ModelWrapper +# import qonnx.core.onnx_exec as oxe +# from qonnx.core.modelwrapper import ModelWrapper from tensorflow.keras.layers import SeparableConv2D from tensorflow.keras.models import Sequential import hls4ml -from hls4ml.backends.vitis.passes.fifo_depth_optimization import override_test_bench +from hls4ml.backends.vitis_accelerator_ip_flow.passes.fifo_depth_optimization import override_test_bench test_root_path = Path(__file__).parent example_model_path = (test_root_path / '../../../example-models').resolve() -backend_options = ['Vitis'] +backend_options = ['VitisAcceleratorIPFlow'] def parse_cosim_report(project_path): @@ -34,7 +34,7 @@ def parse_cosim_report(project_path): top_func_name = line.split('"')[-2] prj_dir = top_func_name + '_prj' - cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_cosim.rpt' + cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt' if os.path.isfile(cosim_file_path): return cosim_file_path @@ -46,7 +46,7 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): """Execute the FIFO depth optimization 
sequence on a dummy Keras model.""" # create a keras model - input_shape = (128, 128, 3) + input_shape = (32, 32, 3) activation = 'relu' kernel_size = (3, 3) padding = 'same' @@ -64,8 +64,8 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') # include the FIFO Depth optimizer do the flows - config['Flows'] = ['vitis:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( profiling_fifo_depth=profiling_fifo_depth ) @@ -73,7 +73,7 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): # execute fifo optimization hls_model = hls4ml.converters.convert_from_keras_model( - model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend + model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend, clock_period=10 ) hls_model.compile() @@ -134,62 +134,62 @@ def test_runtime_error(backend): expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') -@pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') -def get_branched_model(): - """ - Load branched model, already channels-last and cleaned - """ - dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") - assert os.path.isfile(dl_file) - model = ModelWrapper(dl_file) - return model +# def get_branched_model(): +# """ +# Load branched 
model, already channels-last and cleaned +# """ +# dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") +# assert os.path.isfile(dl_file) +# model = ModelWrapper(dl_file) +# return model -def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): - """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" +# def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): +# """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" - ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) - X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) - X = (np.round(X * 2**16) * 2**-16).astype(np.float32) - idict = {model.graph.input[0].name: X} - y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] +# ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) +# X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) +# X = (np.round(X * 2**16) * 2**-16).astype(np.float32) +# idict = {model.graph.input[0].name: X} +# y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] - config = hls4ml.utils.config.config_from_onnx_model( - model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' - ) +# config = hls4ml.utils.config.config_from_onnx_model( +# model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' +# ) - # add this line to remove the linear layer that quantizes the input of the NN - config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' +# # add this line to remove the linear layer that quantizes the input of the NN +# config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' - config['Flows'] = ['vitis:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitis:fifo_depth_optimization').configure( - 
profiling_fifo_depth=profiling_fifo_depth - ) +# config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] +# hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( +# profiling_fifo_depth=profiling_fifo_depth +# ) - output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') +# output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') - hls_model = hls4ml.converters.convert_from_onnx_model( - model, - output_dir=output_dir, - io_type=io_type, - backend=backend, - hls_config=config, - ) - hls_model.compile() - y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) - np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) +# hls_model = hls4ml.converters.convert_from_onnx_model( +# model, +# output_dir=output_dir, +# io_type=io_type, +# backend=backend, +# hls_config=config, +# ) +# hls_model.compile() +# y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) +# np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) - fifo_depth_optimization_checks(hls_model) +# fifo_depth_optimization_checks(hls_model) -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_tiny_unet(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) +# @pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.parametrize('backend', backend_options) +# def test_successful_execution_of_tiny_unet(backend): +# """Test the correct execution of the FIFO depth optimizer.""" +# run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) From d5f2192bb217c68097e0ba58229e0efa1b8dd95e Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 
20 Feb 2025 15:20:41 +0100 Subject: [PATCH 093/103] Run precommit --- hls4ml/backends/__init__.py | 5 +- hls4ml/backends/vitis/vitis_backend.py | 2 +- .../passes/fifo_depth_optimization.py | 14 +-- .../vitis_accelerator_ip_flow_backend.py | 4 +- .../vitis_accelerator_ip_flow_writer.py | 22 +++-- .../test_optimization/test_fifo_depth.py | 86 +++++++++---------- 6 files changed, 74 insertions(+), 59 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 7ba2ad4fbb..031c775c64 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -5,12 +5,13 @@ from hls4ml.backends.vivado.vivado_backend import VivadoBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401 + from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import ( # isort: skip VitisAcceleratorIPFlowBackend, ) -from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( +from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import ( # isort: skip # noqa: F401 VitisAcceleratorIPFlowConfig, ) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index ff2104c795..d1c094ca96 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -97,7 +97,7 @@ def build( os.system( ( 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' - 'validation={validation} export={export} vsynth={vsynth}"' + 'validation={validation} export={export} vsynth={vsynth} fifo_opt={fifo_opt}"' 
).format( reset=reset, csim=csim, diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py index 38706047a7..077d3683c5 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -35,11 +35,11 @@ def initialize_large_fifos(model, profiling_fifo_depth): if output_variable.pragma: initial_fifo_depths[output_variable.name] = int(output_variable.pragma[1]) output_variable.pragma = (output_variable.pragma[0], profiling_fifo_depth) - + inp = model.get_input_variables()[0] initial_fifo_depths['in_local'] = int(inp.pragma[1]) inp.pragma = (inp.pragma[0], profiling_fifo_depth) - + outp = model.get_output_variables()[0] initial_fifo_depths['out_local'] = int(outp.pragma[1]) outp.pragma = (outp.pragma[0], profiling_fifo_depth) @@ -196,7 +196,11 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): # iterate through the layer output FIFOs for output_variable in model.output_vars.values(): - if ("VivadoStreamVariable" in str(type(output_variable))) or (output_variable.name == 'in_local') or (output_variable.name == 'out_local'): + if ( + ("VivadoStreamVariable" in str(type(output_variable))) + or (output_variable.name == 'in_local') + or (output_variable.name == 'out_local') + ): if output_variable.pragma: if output_variable.name not in optimized_fifo_depths.keys(): @@ -204,10 +208,10 @@ def set_optimized_fifo_depths(model, optimized_fifo_depths): filtered_depth = optimized_fifo_depths[output_variable.name] output_variable.pragma = (output_variable.pragma[0], filtered_depth) - + inp = model.get_input_variables()[0] inp.pragma = (inp.pragma[0], optimized_fifo_depths['in_local']) - + outp = model.get_output_variables()[0] outp.pragma = (inp.pragma[0], optimized_fifo_depths['out_local']) return diff --git 
a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index c5dff6f789..66411489fc 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -59,7 +59,7 @@ def create_initial_config( interface='axi_stream', driver='python', input_type='float', - output_type='float' + output_type='float', ): ''' Create initial accelerator config with default parameters @@ -106,7 +106,7 @@ def get_writer_flow(self): def _register_flows(self): # vivado_ip = 'vivado:ip' writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] - self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) + self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) # self._default_flow = vivado_ip # Register the fifo depth optimization flow which is different from the one for vivado diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py index 977a6d6e04..9805c5b33f 100644 --- a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -1,8 +1,6 @@ import os -from distutils.dir_util import copy_tree from shutil import copyfile -# from hls4ml.writer.vivado_writer import VivadoWriter from hls4ml.writer.vitis_writer import VitisWriter @@ -262,9 +260,17 @@ def write_wrapper_test(self, model): indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: - newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'dma_data_packet') + newline = ( + line.replace(inp.size_cpp(), 'N_IN') + 
.replace(inp.name, 'inputs') + .replace(inp.type.name, 'dma_data_packet') + ) elif out.size_cpp() in line or out.name in line or out.type.name in line: - newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'dma_data_packet') + newline = ( + line.replace(out.size_cpp(), 'N_OUT') + .replace(out.name, 'outputs') + .replace(out.type.name, 'dma_data_packet') + ) else: newline = line if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': @@ -300,9 +306,13 @@ def write_wrapper_test(self, model): if f'{model.config.get_project_name()}.h' in line: newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') elif inp.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap') + newline = line.replace( + inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap' + ) elif out.definition_cpp(name_suffix='_ap') in line: - newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap') + newline = line.replace( + out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap' + ) elif f'{model.config.get_project_name()}(' in line: indent_amount = line.split(model.config.get_project_name())[0] newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format( diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_optimization/test_fifo_depth.py index 8589acbe51..6e03e3bf0e 100644 --- a/test/pytest/test_optimization/test_fifo_depth.py +++ b/test/pytest/test_optimization/test_fifo_depth.py @@ -5,8 +5,8 @@ import numpy as np import pytest -# import qonnx.core.onnx_exec as oxe -# from qonnx.core.modelwrapper import ModelWrapper +import qonnx.core.onnx_exec as oxe +from qonnx.core.modelwrapper import ModelWrapper from tensorflow.keras.layers import SeparableConv2D 
from tensorflow.keras.models import Sequential @@ -141,55 +141,55 @@ def test_successful_execution_of_dummy_keras(backend): run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') -# def get_branched_model(): -# """ -# Load branched model, already channels-last and cleaned -# """ -# dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") -# assert os.path.isfile(dl_file) -# model = ModelWrapper(dl_file) -# return model +def get_branched_model(): + """ + Load branched model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") + assert os.path.isfile(dl_file) + model = ModelWrapper(dl_file) + return model -# def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): -# """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" +def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): + """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" -# ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) -# X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) -# X = (np.round(X * 2**16) * 2**-16).astype(np.float32) -# idict = {model.graph.input[0].name: X} -# y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] -# config = hls4ml.utils.config.config_from_onnx_model( -# model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' -# ) + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>' + ) -# # add 
this line to remove the linear layer that quantizes the input of the NN -# config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' + # add this line to remove the linear layer that quantizes the input of the NN + config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' -# config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] -# hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( -# profiling_fifo_depth=profiling_fifo_depth -# ) + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) -# output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') + output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') -# hls_model = hls4ml.converters.convert_from_onnx_model( -# model, -# output_dir=output_dir, -# io_type=io_type, -# backend=backend, -# hls_config=config, -# ) -# hls_model.compile() -# y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) -# np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=output_dir, + io_type=io_type, + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) -# fifo_depth_optimization_checks(hls_model) + fifo_depth_optimization_checks(hls_model) -# @pytest.mark.skip(reason='Skipping synthesis tests for now') -# @pytest.mark.parametrize('backend', backend_options) -# def test_successful_execution_of_tiny_unet(backend): -# """Test the correct execution of the FIFO depth optimizer.""" -# run_fifo_depth_optimization_onnx(backend, 
profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_tiny_unet(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) From 14b413e38ba618668cdfadc124ddf209e77ee00f Mon Sep 17 00:00:00 2001 From: Stelios Tzelepis <79508119+steltze@users.noreply.github.com> Date: Fri, 21 Feb 2025 12:41:47 +0100 Subject: [PATCH 094/103] Update build_prj.tcl --- hls4ml/templates/vivado/build_prj.tcl | 1 - 1 file changed, 1 deletion(-) diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index cd398319c9..6018ba5171 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -195,7 +195,6 @@ if {$opt(cosim)} { if {$opt(fifo_opt)} { puts "\[hls4ml\] - FIFO optimization started" - add_vcd_instructions_tcl if {[string equal "$backend" "vivado"] || [string equal $backend "vivadoaccelerator"]} { add_vcd_instructions_tcl From 9f1c8b3be3f8d8f5b5c779367c247a7857f8eb0e Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 6 Mar 2025 16:11:50 +0100 Subject: [PATCH 095/103] Address pr comments and merge main --- .../vitis_accelerator_ip_flow_backend.py | 4 ++-- hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index 66411489fc..ab0f49f585 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -104,10 +104,10 @@ def get_writer_flow(self): return 
self._writer_flow def _register_flows(self): - # vivado_ip = 'vivado:ip' + vitis_ip = 'vitis:ip' writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name) - # self._default_flow = vivado_ip + self._default_flow = vitis_ip # Register the fifo depth optimization flow which is different from the one for vivado fifo_depth_opt_passes = [ diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh index 262ce00d63..c8314badb0 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh +++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh @@ -13,9 +13,11 @@ INCFLAGS="-Ifirmware/ap_types/" PROJECT=myproject LIB_STAMP=mystamp +BASEDIR="$(cd "$(dirname "$0")" && pwd)" +WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" -${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o -${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o -${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o ${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so rm -f *.o From 47636924a2d26dcc2520caaf5f85199185598a2e Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 6 Mar 2025 17:36:45 +0100 Subject: [PATCH 096/103] Include tests without fifo optimization and checks for bitstream generation --- .../test_vitis_accelerator_ip_flow.py} | 90 +++++++++++-------- 1 file changed, 54 insertions(+), 36 deletions(-) rename test/pytest/{test_optimization/test_fifo_depth.py => 
test_backend/test_vitis_accelerator_ip_flow.py} (66%) diff --git a/test/pytest/test_optimization/test_fifo_depth.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py similarity index 66% rename from test/pytest/test_optimization/test_fifo_depth.py rename to test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 6e03e3bf0e..f855793ecb 100644 --- a/test/pytest/test_optimization/test_fifo_depth.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -19,7 +19,7 @@ backend_options = ['VitisAcceleratorIPFlow'] -def parse_cosim_report(project_path): +def parse_cosim_report_and_search_for_bitstream(project_path): """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is detected. """ @@ -35,14 +35,21 @@ def parse_cosim_report(project_path): prj_dir = top_func_name + '_prj' cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt' - - if os.path.isfile(cosim_file_path): - return cosim_file_path - else: + bitsteam_path = project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit" + + cosim_report_exists = os.path.isfile(cosim_file_path) + bitstream_exists = os.path.isfile(bitsteam_path) + + if cosim_report_exists and bitstream_exists: + return cosim_file_path, bitstream_exists + elif not cosim_report_exists: raise FileNotFoundError("Co-simulation report not found.") - - -def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): + elif not bitstream_exists: + raise FileNotFoundError("Bitstream not found.") + else: + raise FileNotFoundError("Co-simulation report and Bitstream not found.") + +def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on a dummy Keras model.""" # create a keras model @@ -64,12 +71,13 @@ def run_fifo_depth_optimization_keras(backend, 
profiling_fifo_depth, io_type): config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') # include the FIFO Depth optimizer do the flows - config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( - profiling_fifo_depth=profiling_fifo_depth - ) + if run_fifo_depth_optimization: + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) - output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_keras_backend_{backend}') + output_dir = str(test_root_path / f'hls4mlprj_vitis_accelerator_backend_{backend}') # execute fifo optimization hls_model = hls4ml.converters.convert_from_keras_model( @@ -82,34 +90,32 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type): np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) # check that the FIFOs have been optimized succesfully - fifo_depth_optimization_checks(hls_model) + build_and_check(hls_model, run_fifo_depth_optimization) -def fifo_depth_optimization_checks(hls_model): +def build_and_check(hls_model, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on an hls4ml model.""" - # force the top-function to execute twice in the cosimulation, to verify no deadlocks occur even - # when streaming multiple inputs into the network - override_test_bench(hls_model) - # build the new project with optimized depths and execute cosimulation to check for deadlocks # due to the new FIFO depths - hls_model.build(reset=False, csim=False, synth=True, cosim=True) + hls_model.build(synth=True, csim=False, export=True, cosim=True, bitfile=True, vsynth=False) # checks if the fifo depths decreased/were optimized - fifo_depths = {} - with 
open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: - fifo_depths = json.load(fifo_depths_file) + fifo_depths_decreased = False + if run_fifo_depth_optimization: + fifo_depths = {} + with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: + fifo_depths = json.load(fifo_depths_file) - fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) + fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) - # checks that the cosimulation ran succesfully without detecting deadlocks - cosim_report_path = parse_cosim_report(hls_model.config.get_output_dir()) + # checks that the cosimulation ran succesfully without detecting deadlocks and if the bitstream was generated + cosim_report_path, bitstream_exists = parse_cosim_report_and_search_for_bitstream(hls_model.config.get_output_dir()) with open(cosim_report_path) as cosim_report_file: cosim_succesful = any("Pass" in line for line in cosim_report_file) - assert fifo_depths_decreased and cosim_succesful + assert (fifo_depths_decreased or (not run_fifo_depth_optimization)) and cosim_succesful and bitstream_exists def expect_exception(error, message, backend, profiling_fifo_depth, io_type): @@ -138,8 +144,13 @@ def test_runtime_error(backend): @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream') - + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False) + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): + """Test the correct execution of the FIFO depth 
optimizer.""" + run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True) def get_branched_model(): """ @@ -151,7 +162,7 @@ def get_branched_model(): return model -def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model): +def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) @@ -167,10 +178,11 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod # add this line to remove the linear layer that quantizes the input of the NN config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>' - config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] - hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( - profiling_fifo_depth=profiling_fifo_depth - ) + if run_fifo_depth_optimization: + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') @@ -185,11 +197,17 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) - fifo_depth_optimization_checks(hls_model) + build_and_check(hls_model, run_fifo_depth_optimization) @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet(backend): """Test the correct execution of the FIFO depth optimizer.""" - 
run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model()) + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), run_fifo_depth_optimization=False) + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): + """Test the correct execution of the FIFO depth optimizer.""" + run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), run_fifo_depth_optimization=True) From e66ad403dafc855c3ba62db649faf9c847238ee8 Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 6 Mar 2025 17:52:34 +0100 Subject: [PATCH 097/103] Run precommit and remove unused override testbench --- .../passes/fifo_depth_optimization.py | 44 ------------------- .../test_vitis_accelerator_ip_flow.py | 41 ++++++++++++----- 2 files changed, 31 insertions(+), 54 deletions(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py index 077d3683c5..4194ae3365 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py @@ -46,48 +46,6 @@ def initialize_large_fifos(model, profiling_fifo_depth): return initial_fifo_depths -def override_test_bench(model): - """In order for the FIFO depth profiling to produce correct results, it is necessary for the cosimulation to - call the top function - Vitis IP at **least twice**. The test bench produced by the Vivado Writer is - overwritten by adding a for-loop over the top function. - - Args: - model (ModelGraph): The model to which FIFO depth optimization is applied. 
- """ - indent = " " - path_to_old_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp" - path_to_new_test_bench = f"{model.config.get_output_dir()}/{model.config.get_project_name()}_new_test.cpp" - - newline = "" - second_part_of_testbench = False - with open(path_to_old_test_bench) as old_test_bench: - file_iterator = iter(old_test_bench) - for line in file_iterator: - - if "// hls-fpga-machine-learning insert zero" in line: - newline += indent + indent + "const unsigned PROFILING_ITERATIONS = 2;\n" - newline += ( - indent - + indent - + "for(unsigned batch_iteration = 0; batch_iteration < PROFILING_ITERATIONS; ++batch_iteration) {\n" - ) - newline += line - second_part_of_testbench = True - elif ("// hls-fpga-machine-learning insert tb-output" in line) and second_part_of_testbench: - newline += line - newline += next(file_iterator) - newline += indent + "}\n" - else: - newline += line - - with open(path_to_new_test_bench, "w+") as new_test_bench: - new_test_bench.write(newline) - - # replace the old test bench with the new test bench that includes a for-loop - os.replace(path_to_new_test_bench, path_to_old_test_bench) - return - - def execute_cosim_to_profile_fifos(model): """Execute a cosimulation with a testh bench that calls the top function - Vitis IP at **least twice**, to properly profile the max FIFO depths. 
The function will momentarily replace the initial test bench @@ -98,8 +56,6 @@ def execute_cosim_to_profile_fifos(model): """ model.write() - override_test_bench(model) - model.build( reset=False, csim=False, diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index f855793ecb..b775efe634 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -11,7 +11,6 @@ from tensorflow.keras.models import Sequential import hls4ml -from hls4ml.backends.vitis_accelerator_ip_flow.passes.fifo_depth_optimization import override_test_bench test_root_path = Path(__file__).parent example_model_path = (test_root_path / '../../../example-models').resolve() @@ -35,11 +34,13 @@ def parse_cosim_report_and_search_for_bitstream(project_path): prj_dir = top_func_name + '_prj' cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt' - bitsteam_path = project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit" - + bitsteam_path = ( + project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit" + ) + cosim_report_exists = os.path.isfile(cosim_file_path) bitstream_exists = os.path.isfile(bitsteam_path) - + if cosim_report_exists and bitstream_exists: return cosim_file_path, bitstream_exists elif not cosim_report_exists: @@ -48,7 +49,8 @@ def parse_cosim_report_and_search_for_bitstream(project_path): raise FileNotFoundError("Bitstream not found.") else: raise FileNotFoundError("Co-simulation report and Bitstream not found.") - + + def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on a dummy Keras model.""" @@ -144,13 +146,19 @@ def test_runtime_error(backend): 
@pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False) - + run_fifo_depth_optimization_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False + ) + + @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True) + run_fifo_depth_optimization_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True + ) + def get_branched_model(): """ @@ -204,10 +212,23 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), run_fifo_depth_optimization=False) + run_fifo_depth_optimization_onnx( + backend, + profiling_fifo_depth=200_000, + io_type='io_stream', + model=get_branched_model(), + run_fifo_depth_optimization=False, + ) + @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth=200_000, io_type='io_stream', model=get_branched_model(), 
run_fifo_depth_optimization=True) + run_fifo_depth_optimization_onnx( + backend, + profiling_fifo_depth=200_000, + io_type='io_stream', + model=get_branched_model(), + run_fifo_depth_optimization=True, + ) From f51be88830c02f21cf6db832047fd705d18ba4c7 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 08:49:29 +0100 Subject: [PATCH 098/103] Fix qonnx test --- .../test_backend/test_vitis_accelerator_ip_flow.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index b775efe634..6699f5570b 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -100,7 +100,7 @@ def build_and_check(hls_model, run_fifo_depth_optimization): # build the new project with optimized depths and execute cosimulation to check for deadlocks # due to the new FIFO depths - hls_model.build(synth=True, csim=False, export=True, cosim=True, bitfile=True, vsynth=False) + hls_model.build(reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization) # checks if the fifo depths decreased/were optimized fifo_depths_decreased = False @@ -142,7 +142,7 @@ def test_runtime_error(backend): expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') -# @pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_dummy_keras(backend): """Test the correct execution of the FIFO depth optimizer.""" @@ -200,6 +200,9 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod io_type=io_type, backend=backend, hls_config=config, + part="xczu9eg-ffvb1156-2-e", + board='zcu102', + clock_period=10 ) hls_model.compile() 
y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) @@ -221,7 +224,7 @@ def test_successful_execution_of_tiny_unet(backend): ) -@pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" From 85c233c38536ced1dfc86b3e0097100bca06a9eb Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 10:05:56 +0100 Subject: [PATCH 099/103] Fix keras fifo optimization test --- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 2 +- .../zcu102/tcl_scripts/axi_stream_design.tcl | 2 +- .../test_vitis_accelerator_ip_flow.py | 20 +++++++++++++------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl index e8db1e6782..7db291fda6 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -63,7 +63,7 @@ reset_run impl_1 reset_run synth_1 #todo: make number of jobs a variable launch_runs impl_1 -to_step write_bitstream -jobs 10 -wait_on_run -timeout 360 impl_1 +wait_on_run -timeout 480 impl_1 open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl index 103fec0178..34f5468e7e 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl @@ -59,7 +59,7 @@ add_files -norecurse 
./${project_name}_vivado_accelerator/project_1.srcs/sources reset_run impl_1 reset_run synth_1 launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 +wait_on_run -timeout 480 impl_1 open_run impl_1 report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 6699f5570b..4b70589052 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -55,7 +55,7 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, ru """Execute the FIFO depth optimization sequence on a dummy Keras model.""" # create a keras model - input_shape = (32, 32, 3) + input_shape = (16, 16, 3) activation = 'relu' kernel_size = (3, 3) padding = 'same' @@ -100,7 +100,9 @@ def build_and_check(hls_model, run_fifo_depth_optimization): # build the new project with optimized depths and execute cosimulation to check for deadlocks # due to the new FIFO depths - hls_model.build(reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization) + hls_model.build( + reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization + ) # checks if the fifo depths decreased/were optimized fifo_depths_decreased = False @@ -109,7 +111,13 @@ def build_and_check(hls_model, run_fifo_depth_optimization): with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file: fifo_depths = json.load(fifo_depths_file) - fifo_depths_decreased = all(fifo['optimized'] < fifo['initial'] for fifo in fifo_depths.values()) + # omit checking for the input and output AXIS FIFOs as they are not always optimized + # as the last kernel e.g pointwise is faster than AXIS speed + fifo_depths_decreased = all( + fifo_depths['optimized'] < 
fifo_depths['initial'] + for fifo_name, fifo_depths in fifo_depths.items() + if fifo_name not in {'in_local', 'out_local'} + ) # checks that the cosimulation ran succesfully without detecting deadlocks and if the bitstream was generated cosim_report_path, bitstream_exists = parse_cosim_report_and_search_for_bitstream(hls_model.config.get_output_dir()) @@ -200,9 +208,9 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod io_type=io_type, backend=backend, hls_config=config, - part="xczu9eg-ffvb1156-2-e", + part="xczu9eg-ffvb1156-2-e", board='zcu102', - clock_period=10 + clock_period=10, ) hls_model.compile() y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) @@ -224,7 +232,7 @@ def test_successful_execution_of_tiny_unet(backend): ) -# @pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): """Test the correct execution of the FIFO depth optimizer.""" From b91b6414fec78cccd16e7841fe1ba1b9af9f47f8 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 14:20:33 +0100 Subject: [PATCH 100/103] Fix test documentation --- .../test_vitis_accelerator_ip_flow.py | 130 ++++++++++-------- 1 file changed, 71 insertions(+), 59 deletions(-) diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 4b70589052..9d370186a3 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -20,7 +20,7 @@ def parse_cosim_report_and_search_for_bitstream(project_path): """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is - detected. + detected and check if the bitstream was generated without errors. 
""" prj_dir = None top_func_name = None @@ -43,16 +43,18 @@ def parse_cosim_report_and_search_for_bitstream(project_path): if cosim_report_exists and bitstream_exists: return cosim_file_path, bitstream_exists + elif (not cosim_report_exists) and (not bitstream_exists): + raise FileNotFoundError("Co-simulation report and Bitstream not found.") elif not cosim_report_exists: raise FileNotFoundError("Co-simulation report not found.") - elif not bitstream_exists: - raise FileNotFoundError("Bitstream not found.") else: - raise FileNotFoundError("Co-simulation report and Bitstream not found.") + raise FileNotFoundError("Bitstream not found.") -def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): - """Execute the FIFO depth optimization sequence on a dummy Keras model.""" +def run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization): + """Execute bitstream generation on a dummy Keras model and the FIFO optimization sequence if + `run_fifo_depth_optimization` is set. 
+ """ # create a keras model input_shape = (16, 16, 3) @@ -72,16 +74,17 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, ru config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') - # include the FIFO Depth optimizer do the flows + # include the FIFO Depth optimizer do the flows if `run_fifo_depth_optimization` is set if run_fifo_depth_optimization: config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( profiling_fifo_depth=profiling_fifo_depth ) - output_dir = str(test_root_path / f'hls4mlprj_vitis_accelerator_backend_{backend}') + output_dir = str( + test_root_path / f'hls4mlprj_keras_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}' + ) - # execute fifo optimization hls_model = hls4ml.converters.convert_from_keras_model( model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend, clock_period=10 ) @@ -91,15 +94,16 @@ def run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type, ru np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) - # check that the FIFOs have been optimized succesfully + # build the hls4ml model and check if the bitstream was generated and the FIFOs were optimized if + # `run_fifo_depth_optimization` is set build_and_check(hls_model, run_fifo_depth_optimization) def build_and_check(hls_model, run_fifo_depth_optimization): """Execute the FIFO depth optimization sequence on an hls4ml model.""" - # build the new project with optimized depths and execute cosimulation to check for deadlocks - # due to the new FIFO depths + # try to generate a bitstream. 
Use the optimized FIFO depths and execute cosimulation to check for deadlocks + # due to the new FIFO depths if `run_fifo_depth_optimization` is set hls_model.build( reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization ) @@ -130,47 +134,12 @@ def build_and_check(hls_model, run_fifo_depth_optimization): def expect_exception(error, message, backend, profiling_fifo_depth, io_type): with pytest.raises(error, match=re.escape(message)): - run_fifo_depth_optimization_keras(backend, profiling_fifo_depth, io_type) - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"]) -def test_value_error(backend, profiling_fifo_depth): - """Test the FIFO depth optimizer with faulty inputs of profiling_fifo_depth to verify that an exception is raised.""" - message = "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer." - expect_exception(ValueError, message, backend, profiling_fifo_depth, io_type='io_stream') - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_runtime_error(backend): - """Test the FIFO depth optimizer with io_type='io_parallel' to verify that an exception is raised.""" - message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config." 
- expect_exception(RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel') - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_dummy_keras(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras( - backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False - ) - - -@pytest.mark.skip(reason='Skipping synthesis tests for now') -@pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_keras( - backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True - ) + run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type) def get_branched_model(): """ - Load branched model, already channels-last and cleaned + Load branched model, already channels-last and cleaned. """ dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") assert os.path.isfile(dl_file) @@ -178,8 +147,10 @@ def get_branched_model(): return model -def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization): - """Execute the FIFO depth optimization sequence on a ONNX/QONNX model.""" +def run_bitstream_generation_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization): + """Execute bitstream generation on a QONNX branched model and the FIFO optimization sequence if + `run_fifo_depth_optimization` is set. 
+ """ ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) @@ -200,7 +171,9 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod profiling_fifo_depth=profiling_fifo_depth ) - output_dir = str(test_root_path / f'hls4mlprj_fifo_depth_optimization_branched_model_backend_{backend}') + output_dir = str( + test_root_path / f'hls4mlprj_branched_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}' + ) hls_model = hls4ml.converters.convert_from_onnx_model( model, @@ -221,9 +194,48 @@ def run_fifo_depth_optimization_onnx(backend, profiling_fifo_depth, io_type, mod @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_tiny_unet(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx( +@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"]) +def test_value_error(backend, profiling_fifo_depth): + """Test the FIFO depth optimizer with faulty inputs of profiling_fifo_depth to verify that an exception is raised.""" + message = "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer." + expect_exception( + ValueError, message, backend, profiling_fifo_depth, io_type='io_stream', run_fifo_depth_optimization=True + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_runtime_error(backend): + """Test the FIFO depth optimizer with io_type='io_parallel' to verify that an exception is raised.""" + message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config." 
+ expect_exception( + RuntimeError, message, backend, profiling_fifo_depth=200_000, io_type='io_parallel', run_fifo_depth_optimization=True + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras(backend): + """Test the correct execution of the bitstream generation.""" + run_bitstream_generation_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=False + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend): + """Test the correct execution of the bitstream generation with the FIFO depth optimizer.""" + run_bitstream_generation_keras( + backend, profiling_fifo_depth=200_000, io_type='io_stream', run_fifo_depth_optimization=True + ) + + +@pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.parametrize('backend', backend_options) +def test_successful_execution_of_branched_model(backend): + """Test the correct execution of the bitstream generation.""" + run_bitstream_generation_onnx( backend, profiling_fifo_depth=200_000, io_type='io_stream', @@ -232,11 +244,11 @@ def test_successful_execution_of_tiny_unet(backend): ) -@pytest.mark.skip(reason='Skipping synthesis tests for now') +# @pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) -def test_successful_execution_of_tiny_unet_with_fifo_optimization(backend): - """Test the correct execution of the FIFO depth optimizer.""" - run_fifo_depth_optimization_onnx( +def test_successful_execution_of_branched_model_with_fifo_optimization(backend): + """Test the correct execution of the bitstream generation with the FIFO depth optimizer.""" + run_bitstream_generation_onnx( backend, profiling_fifo_depth=200_000, io_type='io_stream', From 
5bc54d3d08cb2f1c36e24734aa0aca13a5042d8b Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 16:36:19 +0100 Subject: [PATCH 101/103] Fix vivado project path in the build tcl for zcu102 --- .../zcu102/tcl_scripts/axi_stream_design.tcl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl index 34f5468e7e..689186eb5f 100644 --- a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl @@ -2,7 +2,7 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force +create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xczu9eg-ffvb1156-2-e -force set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] set_property ip_repo_paths ${project_name}_prj [current_project] @@ -52,9 +52,9 @@ connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins $ apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] -make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v +add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v reset_run impl_1 reset_run synth_1 From 
0a0d7d1a3db19521ef90a59bc714364e5082a264 Mon Sep 17 00:00:00 2001 From: steltze Date: Fri, 7 Mar 2025 18:48:16 +0100 Subject: [PATCH 102/103] Skip all tests --- test/pytest/test_backend/test_vitis_accelerator_ip_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py index 9d370186a3..00af95fce6 100644 --- a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py +++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py @@ -244,7 +244,7 @@ def test_successful_execution_of_branched_model(backend): ) -# @pytest.mark.skip(reason='Skipping synthesis tests for now') +@pytest.mark.skip(reason='Skipping synthesis tests for now') @pytest.mark.parametrize('backend', backend_options) def test_successful_execution_of_branched_model_with_fifo_optimization(backend): """Test the correct execution of the bitstream generation with the FIFO depth optimizer.""" From e55c52e43e839147c6b140cf91ff15eb45df613b Mon Sep 17 00:00:00 2001 From: steltze Date: Thu, 13 Mar 2025 18:21:50 +0100 Subject: [PATCH 103/103] Link backend fifo optimization options --- .../vitis_accelerator_ip_flow_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py index ab0f49f585..f1f16a1e83 100644 --- a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py +++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py @@ -34,7 +34,7 @@ def build( validation=validation, export=export, vsynth=vsynth, - fifo_opt=True, + fifo_opt=fifo_opt, ) # now make a bitfile