diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py
index 4a48f072cd..6c00f0aecb 100644
--- a/hls4ml/backends/__init__.py
+++ b/hls4ml/backends/__init__.py
@@ -8,12 +8,18 @@
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig  # noqa: F401
 
 from hls4ml.backends.catapult.catapult_backend import CatapultBackend  # isort: skip
-
 from hls4ml.backends.vitis.vitis_backend import VitisBackend  # isort: skip
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import (  # isort: skip
+    VitisAcceleratorIPFlowBackend,
+)
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import (  # isort: skip  # noqa: F401
+    VitisAcceleratorIPFlowConfig,
+)
 
 register_backend('Vivado', VivadoBackend)
 register_backend('VivadoAccelerator', VivadoAcceleratorBackend)
 register_backend('Vitis', VitisBackend)
+register_backend('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowBackend)
 register_backend('Quartus', QuartusBackend)
 register_backend('Catapult', CatapultBackend)
 register_backend('SymbolicExpression', SymbolicExpressionBackend)
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py
new file mode 100644
index 0000000000..4194ae3365
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/passes/fifo_depth_optimization.py
@@ -0,0 +1,221 @@
+import json
+import os
+
+from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass
+
+
+def initialize_large_fifos(model, profiling_fifo_depth):
+    """Inflate every profiled FIFO to ``profiling_fifo_depth`` and report the depths that were replaced.
+
+    Args:
+        model (ModelGraph): The model to which FIFO depth optimization is applied.
+        profiling_fifo_depth (int): Depth written into the FIFO pragmas; it must exceed the deepest FIFO expected.
+
+    Returns:
+        Dict[str, int]: The depth each FIFO had before being overwritten, keyed by FIFO name. The first model input
+        and output variables are always reported under the fixed keys ``in_local`` and ``out_local``.
+    """
+
+    previous_depths = {}
+
+    # Pass 1 - internal FIFOs: stream variables other than the first model input/output, which are implementation
+    # dependent (AXI Stream, AXI Master or a connection to another IP). Large FIFOs get implemented in BRAMs and
+    # can therefore be profiled. Alternatively, "config_dataflow -override_user_fifo_depth profiling_fifo_depth"
+    # can be used inside build_prj.tcl to override all FIFO depths with the specified value.
+    internal_fifos = [
+        candidate
+        for candidate in model.output_vars.values()
+        if "VivadoStreamVariable" in str(type(candidate))
+        and candidate != model.get_output_variables()[0]
+        and candidate != model.get_input_variables()[0]
+    ]
+    for fifo in internal_fifos:
+        if not fifo.pragma:
+            continue  # no pragma, hence no depth to record or replace
+        pragma_kind, current_depth = fifo.pragma[0], fifo.pragma[1]
+        previous_depths[fifo.name] = int(current_depth)
+        fifo.pragma = (pragma_kind, profiling_fifo_depth)
+
+    # Pass 2 - first input and output variable: recorded under fixed keys and enlarged unconditionally.
+    boundary_fifos = (('in_local', model.get_input_variables), ('out_local', model.get_output_variables))
+    for key, fetch_variables in boundary_fifos:
+        boundary = fetch_variables()[0]
+        previous_depths[key] = int(boundary.pragma[1])
+        boundary.pragma = (boundary.pragma[0], profiling_fifo_depth)
+
+    return previous_depths
+
+
+def execute_cosim_to_profile_fifos(model):
+    """Write the project and run synthesis plus cosimulation with FIFO profiling switched on.
+
+    The build is started with ``fifo_opt=True``. To properly profile the max FIFO depths, the test bench in use
+    has to call the top function (the Vitis IP) at **least twice**.
+
+    Args:
+        model (ModelGraph): The model to which FIFO depth optimization is applied.
+    """
+    # Profiling only needs synthesis and cosimulation, so every other stage stays off and nothing is reset.
+    build_options = {
+        'reset': False,
+        'csim': False,
+        'synth': True,
+        'cosim': True,
+        'validation': False,
+        'export': False,
+        'vsynth': False,
+        'fifo_opt': True,
+    }
+    model.write()
+    model.build(**build_options)
+
+
+def get_vitis_optimized_fifo_depths(model):
+    """Parse the files generated by the cosimulation to retrieve the optimized depths for the FIFOs.
+    Attention, only the FIFOs between the layers are profiled!
+
+    Args:
+        model (ModelGraph): The model to which FIFO depth optimization is applied.
+
+    Returns:
+        Dict[str, int]: A dictionary that contains the FIFO names as keys and the optimized depths as values.
+
+    Raises:
+        OSError: If ``channel.zip``, ``channel_info.csv`` or a referenced ``chan_status*.csv`` file cannot be read.
+        zipfile.BadZipFile: If ``channel.zip`` is not a valid archive.
+    """
+    import zipfile  # function-scope import so that the module-level imports stay untouched
+
+    db_dir = os.path.join(
+        model.config.get_output_dir(), model.config.get_project_name() + "_prj", "solution1", ".autopilot", "db"
+    )
+    channel_depth_dir = os.path.join(db_dir, "channel_depth_info")
+
+    # channel.zip is generated after the cosimulation and contains the chan_status*.csv files. It is unpacked with
+    # the standard library rather than by shelling out to `unzip`: paths with spaces or shell metacharacters are
+    # handled correctly and a missing or corrupt archive raises right here instead of being silently ignored.
+    with zipfile.ZipFile(os.path.join(channel_depth_dir, "channel.zip")) as archive:
+        archive.extractall(channel_depth_dir)
+
+    # the channel_info.csv file contains the mapping of each fifo name (i.e layer4_out_U) to the respective
+    # chan_status*.csv file
+    csv_fifo_depth_files = {}
+    with open(os.path.join(db_dir, "channel_info.csv")) as names_file:
+        for line in names_file:
+            fields = line.strip().split(",")
+            if len(fields) < 4:
+                continue  # blank or malformed row: there is nothing to map
+            # strip() instead of dropping the last character: a final row without a newline keeps its full name
+            csv_fifo_depth_files[fields[1]] = fields[3]
+
+    optimized_fifo_depths = {}
+    for fifo_name, file_name in csv_fifo_depth_files.items():
+        with open(os.path.join(channel_depth_dir, file_name)) as chan_status_file:
+            # the max depth achieved during cosimulation is found at the last line of a chan_status*.csv file
+            max_depth = int(chan_status_file.readlines()[-1])
+        # remove the trailing "_U" from the name found in channel_info.csv (i.e layer4_out_U -> layer4_out) so
+        # that the key matches the name of the FIFO variable
+        optimized_fifo_depths[fifo_name[:-2]] = max_depth
+
+    return optimized_fifo_depths
+
+
+def generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths):
+    """Generate a json file with the names of the FIFOs, the initial depths set by hls4ml and their optimized depths,
+    for post-processing. The json file is not used by the rest of the pipeline, it is only produced for the user.
+
+    Args:
+        model (ModelGraph): The model to which FIFO depth optimization is applied.
+        initial_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the initial
+            depths as values.
+        optimized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized
+            depths as values. A FIFO missing here is reported with ``null`` instead of aborting with a KeyError.
+    """
+    # .get(): a FIFO that was never profiled must not abort the report (set_optimized_fifo_depths skips it too)
+    depths = {
+        fifo_name: {'initial': initial_depth, 'optimized': optimized_fifo_depths.get(fifo_name)}
+        for fifo_name, initial_depth in initial_fifo_depths.items()
+    }
+
+    with open(os.path.join(model.config.get_output_dir(), "fifo_depths.json"), "w") as f:
+        json.dump(depths, f, indent=4)
+
+
+def set_optimized_fifo_depths(model, optimized_fifo_depths):
+    """Set the new optimized FIFO depths.
+
+    Args:
+        model (ModelGraph): The model to which FIFO depth optimization is applied.
+        optimized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized
+            depths as values. It must contain the keys ``in_local`` and ``out_local``.
+
+    Raises:
+        KeyError: If ``in_local`` or ``out_local`` is missing from ``optimized_fifo_depths``.
+    """
+
+    # iterate through the layer output FIFOs
+    for output_variable in model.output_vars.values():
+        is_fifo = (
+            "VivadoStreamVariable" in str(type(output_variable))
+            or output_variable.name in ('in_local', 'out_local')
+        )
+        # variables without a pragma, or that were not profiled, keep their current depth
+        if not is_fifo or not output_variable.pragma or output_variable.name not in optimized_fifo_depths:
+            continue
+        output_variable.pragma = (output_variable.pragma[0], optimized_fifo_depths[output_variable.name])
+
+    # the first input and output variables are always resized, from the fixed keys `in_local` / `out_local`
+    inp = model.get_input_variables()[0]
+    inp.pragma = (inp.pragma[0], optimized_fifo_depths['in_local'])
+
+    outp = model.get_output_variables()[0]
+    # keep the output's own pragma kind (it used to be copied from the input variable by mistake)
+    outp.pragma = (outp.pragma[0], optimized_fifo_depths['out_local'])
+
+
+class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass):
+    def __init__(self):
+        pass
+
+    def transform(self, model):
+        """Perform FIFO depth optimization between the FIFOs of all layers to reduce resource utilization as the
+        initial FIFOs set by hls4ml might be larger than required. At the end of the optimization the FIFOs will
+        have the largest depths achieved during cosimulation without causing any deadlocks between the layers
+        (producer-consumer), thus no additional delays between the layers. In some cases, this optimization
+        might lead to bigger FIFOs than initially set by the hls4ml tool in order to prevent deadlocks.
+
+        Args:
+            model (ModelGraph): The model to which FIFO depth optimization is applied.
+
+        Raises:
+            ValueError: If the FIFO depth for profiling provided by the user is not a positive integer.
+            RuntimeError: If the IO type is not set to "io_stream".
+
+        Returns:
+            bool: Always False (presumably "the model graph was not changed" - confirm against the optimizer base).
+        """
+
+        # `profiling_fifo_depth` is read from this pass instance and defaults to 100_000 when it was never set on it;
+        # consider changing 100_000 either with a very very large value > of any total bram storage space
+        # or via vitis 2023.2 c-simulation
+        profiling_fifo_depth = getattr(self, "profiling_fifo_depth", 100_000)
+
+        if not isinstance(profiling_fifo_depth, int) or profiling_fifo_depth <= 0:
+            raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a positive integer.")
+
+        # only `io_stream` models are supported, any other IOType is rejected
+        if not (model.config.get_config_value("IOType") == "io_stream"):
+            raise RuntimeError("To use this optimization you have to set `IOType` field to `io_stream` in the HLS config.")
+
+        initial_fifo_depths = initialize_large_fifos(model, profiling_fifo_depth)
+        # run synthesis + cosimulation on the enlarged FIFOs to record the depths that are actually reached
+        execute_cosim_to_profile_fifos(model)
+        # read back the max depth observed for each FIFO
+        optimized_fifo_depths = get_vitis_optimized_fifo_depths(model)
+        # write fifo_depths.json for the user (initial vs. optimized depths)
+        generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths)
+        # apply the observed depths to the model
+        set_optimized_fifo_depths(model, optimized_fifo_depths)
+
+        print("[hls4ml] - FIFO optimization completed")
+        return False
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json
new file mode 100644
index 0000000000..4a54ea2924
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json
@@ -0,0 +1,14 @@
+{
+  "pynq-z2": {
+    "part": "xc7z020clg400-1",
+    "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream":  "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream":  "axi_stream_driver.py"},
+    "c_drivers": {}
+  },
+  "zcu102": {
+    "part": "xczu9eg-ffvb1156-2-e",
+    "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream":  "axi_stream_driver.py"},
+    "c_drivers": {}
+  }
+}
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py
new file mode 100644
index 0000000000..f1f16a1e83
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py
@@ -0,0 +1,117 @@
+import os
+
+from hls4ml.backends import VitisBackend, VivadoBackend
+from hls4ml.model.flow import register_flow
+from hls4ml.report import parse_vivado_report
+
+
+class VitisAcceleratorIPFlowBackend(VitisBackend):
+    """Vitis backend variant that adds accelerator settings to the config and can launch a Vivado bitfile build."""
+
+    def __init__(self):
+        # super(VivadoBackend, self) resumes the MRO lookup *after* VivadoBackend, so the initializers of the classes
+        # up to and including VivadoBackend are bypassed; attributes and flows are then registered explicitly.
+        super(VivadoBackend, self).__init__(name='VitisAcceleratorIPFlow')
+        self._register_layer_attributes()
+        self._register_flows()
+
+    def build(
+        self,
+        model,
+        reset=False,
+        csim=True,
+        synth=True,
+        cosim=False,
+        validation=False,
+        export=False,
+        vsynth=False,
+        fifo_opt=False,
+        bitfile=False,
+    ):
+        """Run the parent HLS build and, if ``bitfile`` is set, a Vivado batch run of ``design.tcl``.
+
+        All arguments except ``bitfile`` are forwarded unchanged to the parent build. The Vivado run is executed
+        from the model's output directory; a non-zero exit status is reported with a message, not an exception.
+
+        Returns:
+            The result of ``parse_vivado_report`` for the model's output directory.
+        """
+        super().build(
+            model,
+            reset=reset,
+            csim=csim,
+            synth=synth,
+            cosim=cosim,
+            validation=validation,
+            export=export,
+            vsynth=vsynth,
+            fifo_opt=fifo_opt,
+        )
+
+        if bitfile:
+            curr_dir = os.getcwd()
+            os.chdir(model.config.get_output_dir())
+            try:
+                # os.system() signals failure through its return value; it does not raise when the command fails
+                if os.system('vivado -mode batch -source design.tcl') != 0:
+                    print("Something went wrong, check the Vivado logs")
+            finally:
+                os.chdir(curr_dir)  # always restore the caller's working directory
+
+        return parse_vivado_report(model.config.get_output_dir())
+
+    def create_initial_config(
+        self,
+        board='pynq-z2',
+        part=None,
+        clock_period=5,
+        clock_uncertainty='12.5%',
+        io_type='io_parallel',
+        interface='axi_stream',
+        driver='python',
+        input_type='float',
+        output_type='float',
+    ):
+        """Create the initial accelerator config with default parameters.
+
+        Args:
+            board: One of the keys defined in supported_boards.json; ``None`` selects 'pynq-z2'.
+            part, clock_period, clock_uncertainty, io_type: Forwarded to the parent ``create_initial_config``.
+            interface: `axi_stream`, `axi_master` or `axi_lite`; selects the kind of AXI channel the generated
+                hardware design and drivers exploit (don't use `axi_lite` to exchange large amounts of data).
+            driver: `python` generates the driver for the PYNQ stack, `c` the driver for bare-metal use.
+            input_type, output_type: The wrapper input/output precision; `float` or an `ap_type`. Non-float widths
+                are rounded up to a multiple of 8 bits by VitisAcceleratorIPFlowConfig.
+
+        Returns:
+            The populated config, with the accelerator settings stored under the 'AcceleratorConfig' key.
+        """
+        board = board if board is not None else 'pynq-z2'
+        config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type)
+        config['AcceleratorConfig'] = {
+            'Board': board,
+            'Interface': interface,  # axi_stream, axi_master, axi_lite
+            'Driver': driver,
+            'Precision': {'Input': input_type, 'Output': output_type},  # float, double or ap_fixed<a,b>
+        }
+
+        return config
+
+    def get_default_flow(self):
+        return self._default_flow
+
+    def get_writer_flow(self):
+        return self._writer_flow
+
+    def _register_flows(self):
+        """Register the writer and FIFO-depth-optimization flows; both require the 'vitis:ip' flow."""
+        vitis_ip = 'vitis:ip'
+        writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls']
+        self._writer_flow = register_flow('write', writer_passes, requires=[vitis_ip], backend=self.name)
+        self._default_flow = vitis_ip
+
+        # Register the fifo depth optimization flow which is different from the one for vivado. The writer passes
+        # are appended because a new project will be written after the optimization.
+        fifo_depth_opt_passes = ['vitisacceleratoripflow:fifo_depth_optimization'] + writer_passes
+
+        register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vitis_ip], backend=self.name)
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py
new file mode 100644
index 0000000000..07961a9b6f
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py
@@ -0,0 +1,169 @@
+import json
+import os
+
+import numpy as np
+
+from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType
+
+
+class VitisAcceleratorIPFlowConfig:
+    """Accelerator settings (board, part, interface, driver, I/O precision) resolved from an hls4ml config."""
+
+    def __init__(self, config, model_inputs, model_outputs):
+        """Read, validate and complete the ``AcceleratorConfig`` section of ``config.config``.
+
+        Args:
+            config: Object exposing the configuration dict as ``config.config`` and the backend as ``config.backend``.
+            model_inputs: Input variables of the model; exactly one is supported.
+            model_outputs: Output variables of the model; exactly one is supported.
+
+        Raises:
+            Exception: If the board is not listed in supported_boards.json, or if ``AcceleratorConfig`` is given
+                without ``Precision`` or without its ``Input``/``Output`` fields.
+            AssertionError: If the model does not have exactly one input and one output tensor.
+        """
+        self.config = config.config
+        self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2')
+        # a context manager closes the file handle, which a bare json.load(open(...)) leaves to the garbage collector
+        with open(os.path.dirname(__file__) + '/supported_boards.json') as boards_file:
+            self.supported_boards = json.load(boards_file)
+        if self.board in self.supported_boards.keys():
+            board_info = self.supported_boards[self.board]
+            self.part = board_info['part']
+        else:
+            raise Exception('The board does not appear in supported_boards.json file')
+
+        # the part implied by the board replaces an explicitly configured 'Part' that conflicts with it
+        if self.config.get('Part') is not None:
+            if self.config.get('Part') != self.part:
+                print(
+                    'WARNING: You set a Part that does not correspond to the Board you specified. The correct '
+                    'Part is now set.'
+                )
+                self.config['Part'] = self.part
+        accel_config = self.config.get('AcceleratorConfig', None)
+        if accel_config is not None:
+            prec = accel_config.get('Precision')
+            if prec is None:
+                raise Exception('Precision must be provided in the AcceleratorConfig')
+            else:
+                if prec.get('Input') is None or prec.get('Output') is None:
+                    raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision')
+        else:
+            accel_config = {
+                'Precision': {'Input': 'float', 'Output': 'float'},
+                'Driver': 'python',
+                'Interface': 'axi_stream',
+            }
+            config.config['AcceleratorConfig'] = accel_config
+
+        self.interface = accel_config.get('Interface', 'axi_stream')  # axi_stream, axi_master, axi_lite
+        self.driver = accel_config.get('Driver', 'python')  # python or c
+        self.platform = accel_config.get('Platform', 'xilinx_u250_xdma_201830_2')  # Get platform folder name
+
+        assert (
+            len(model_inputs) == 1
+        ), "Only models with one input tensor are currently supported by VitisAcceleratorIPFlowBackend"
+        assert (
+            len(model_outputs) == 1
+        ), "Only models with one output tensor are currently supported by VitisAcceleratorIPFlowBackend"
+        self.inp = model_inputs[0]
+        self.out = model_outputs[0]
+
+        # each precision is float, double or ap_fixed<a,b>; input and output go through the same resolution
+        precision = accel_config['Precision']
+        self.input_type, self.input_bitwidth = self._resolve_axi_type(config, precision.get('Input', 'float'))
+        self.output_type, self.output_bitwidth = self._resolve_axi_type(config, precision.get('Output', 'float'))
+
+    def _resolve_axi_type(self, config, axi_t):
+        '''Return ``(type, bitwidth)`` for a wrapper precision given as `float`, `double` or a precision string.'''
+        if axi_t == 'float':
+            return axi_t, 32
+        if axi_t == 'double':
+            return axi_t, 64
+        # NOTE(review): the reported bitwidth is the width of the requested type, not of the widened type returned
+        requested = config.backend.convert_precision_string(axi_t)
+        return self._next_factor8_type(requested), requested.width
+
+    def _next_factor8_type(self, p):
+        '''Return a new type with the width rounded up to the next multiple of 8.
+
+        Args:
+            p : IntegerPrecisionType or FixedPrecisionType
+        Returns:
+            An IntegerPrecisionType or FixedPrecisionType with p's width rounded up to the next multiple of 8.
+            Other parameters (integer bits, extra modes) stay the same. None for any other type of ``p``.
+        '''
+        W = p.width
+        newW = int(np.ceil(W / 8) * 8)
+        if isinstance(p, FixedPrecisionType):
+            return FixedPrecisionType(newW, p.integer, p.signed, p.rounding_mode, p.saturation_mode, p.saturation_bits)
+        elif isinstance(p, IntegerPrecisionType):
+            return IntegerPrecisionType(newW, p.signed)
+
+    def get_io_bitwidth(self):
+        '''Return the ``(input, output)`` bitwidths computed in the constructor.'''
+        return self.input_bitwidth, self.output_bitwidth
+
+    def get_corrected_types(self):
+        '''Return the resolved input type, the resolved output type, the input variable and the output variable.'''
+        return self.input_type, self.output_type, self.inp, self.out
+
+    def get_interface(self):
+        return self.interface
+
+    def get_board_info(self, board=None):
+        '''Return the supported_boards.json entry of ``board`` (default: the configured board) or raise.'''
+        if board is None:
+            board = self.board
+        if board in self.supported_boards.keys():
+            return self.supported_boards[board]
+        else:
+            raise Exception('The board is still not supported')
+
+    def get_part(self):
+        return self.part
+
+    def get_driver(self):
+        return self.driver
+
+    def get_board(self):
+        return self.board
+
+    def get_platform(self):
+        return self.platform
+
+    def get_clock_period(self):
+        # `self.clock_period` is never assigned by this class, so returning it always raised AttributeError.
+        # NOTE(review): 'ClockPeriod' is assumed to be the key holding the clock period in the config - confirm.
+        return self.config.get('ClockPeriod')
+
+    def get_driver_path(self):
+        # every board whose name starts with 'alveo' shares the 'alveo' template folder
+        board_dir = 'alveo' if self.board.startswith('alveo') else self.board
+        return f'../templates/vitis_accelerator_ip_flow/{board_dir}/{self.driver}_drivers/{self.get_driver_file()}'
+
+    def get_driver_file(self):
+        driver_ext = '.py' if self.driver == 'python' else '.h'
+        return self.interface + '_driver' + driver_ext
+
+    def get_krnl_rtl_src_dir(self):
+        return '../templates/vitis_accelerator_ip_flow/alveo/krnl_rtl_src'
+
+    def get_input_type(self):
+        return self.input_type
+
+    def get_output_type(self):
+        return self.output_type
+
+    def get_tcl_file_path(self):
+        '''Return the template path of the tcl script for the configured board and interface, or raise.'''
+        board_info = self.get_board_info(self.board)
+        tcl_scripts = board_info.get('tcl_scripts', None)
+        if tcl_scripts is None:
+            raise Exception('No tcl scripts definition available for the board in supported_board.json')
+        tcl_script = tcl_scripts.get(self.interface, None)
+        if tcl_script is None:
+            raise Exception('No tcl script definition available for the desired interface in supported_board.json')
+        board_dir = 'alveo' if self.board.startswith('alveo') else self.board
+        return '../templates/vitis_accelerator_ip_flow/' + board_dir + '/tcl_scripts/' + tcl_script
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh
new file mode 100644
index 0000000000..c8314badb0
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Compiles the firmware, its AXI wrapper and the bridge, then links the objects into a single shared library.
+CC=g++
+if [[ "$OSTYPE" == "linux-gnu" ]]; then
+    CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique"
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    CFLAGS="-O3 -fPIC -std=c++11"
+fi
+VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR_IP_FLOW"
+CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS"
+# NOTE(review): for any other $OSTYPE neither branch assigns CFLAGS, so no -O3/-fPIC/-std flags are added here.
+INCFLAGS="-Ifirmware/ap_types/"
+
+PROJECT=myproject
+LIB_STAMP=mystamp
+BASEDIR="$(cd "$(dirname "$0")" && pwd)"
+WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\""
+# Inputs are resolved relative to the current directory; the object files are deleted after linking.
+${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o
+${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o
+${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o
+${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so
+rm -f *.o
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp
new file mode 100644
index 0000000000..1655ce506b
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp
@@ -0,0 +1,14 @@
+// hls-fpga-machine-learning insert include
+
+void myproject_axi(hls::stream<dma_data_packet> &in, hls::stream<dma_data_packet> &out) {
+    // Wrapper skeleton: each marker comment below is a placeholder (presumably expanded by the writer - verify).
+    // hls-fpga-machine-learning insert interface
+
+    // hls-fpga-machine-learning insert local vars
+
+    // hls-fpga-machine-learning insert enqueue
+
+    // hls-fpga-machine-learning insert call
+
+    // hls-fpga-machine-learning insert dequeue
+}
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h
new file mode 100644
index 0000000000..1c019b5f10
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h
@@ -0,0 +1,10 @@
+#ifndef MYPROJECT_AXI_H_
+#define MYPROJECT_AXI_H_
+
+#include <iostream>
+// hls-fpga-machine-learning insert include
+
+// hls-fpga-machine-learning insert definitions
+// Wrapper entry point: one input and one output stream of dma_data_packet (a type not declared in this template).
+void myproject_axi(hls::stream<dma_data_packet> &in, hls::stream<dma_data_packet> &out);
+#endif
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1aac79f2d3
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    """PYNQ overlay that streams inputs to the network through `hier_0.axi_dma_0` and reads back its outputs."""
+
+    def __init__(
+        self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None
+    ):
+        # forward the caller's overlay options; they used to be silently replaced by hard-coded defaults
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
+        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        dt = timeb - timea
+        dts = dt.total_seconds()  # also counts whole days, unlike dt.seconds + dt.microseconds
+        rate = N / dts if dts > 0 else float("inf")  # identical timestamps must not raise ZeroDivisionError
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - debug : boolean. Set it to `True` to print a message after each stage of the DMA transfer.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
+        - encode/decode: optional callables applied to `X` before it is sent and to the output after it is received.
+                  They are needed when the `dtype` given to the constructor does not match the data directly:
+                  if the accelerator uses 'float' types for the 'data' AXI-Stream field, 'np.float32' dtype is the
+                  correct one to use. Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note
+                  that A cannot be any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+                  doc for more info). In this case the encoding/decoding has to be computed by the PS. For example
+                  for 'ap_fixed<16,6>' type the following 2 functions are the correct ones to use:
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode) # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - return: the output buffer, or the result of `decode` applied to it when `decode` is given. When
+                  `profile` is set, a tuple `(output, seconds, inferences_per_second)` is returned instead.
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.sendchannel.transfer(self.input_buffer)
+        self.recvchannel.transfer(self.output_buffer)
+        if debug:
+            print("Transfer OK")
+        self.sendchannel.wait()
+        if debug:
+            print("Send OK")
+        self.recvchannel.wait()
+        if debug:
+            print("Receive OK")
+        # Keep the decoded data in a local: rebinding self.output_buffer would replace the buffer obtained from
+        # allocate(), which the next predict() call hands to the DMA receive channel.
+        result = self.output_buffer if decode is None else decode(self.output_buffer)
+
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return result, dts, rate
+        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..7db291fda6
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,69 @@
+#@todo: try to remove startgroup and endgroup and see if it still works
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+# Create the Vivado project for the xc7z020clg400-1 part.
+create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xc7z020clg400-1 -force
+
+# set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
+set_property  ip_repo_paths  ${project_name}_prj [current_project]
+update_ip_catalog
+# Block design: processing system + AXI DMA + the ${project_name}_axi IP.
+create_bd_design "design_1"
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells processing_system7_0]
+# Enable the S_AXI_HP0 and S_AXI_HP2 slave ports of the PS; the DMA masters are attached to them below.
+startgroup
+set_property -dict [list \
+  CONFIG.PCW_USE_S_AXI_HP0 {1} \
+  CONFIG.PCW_USE_S_AXI_HP2 {1} \
+] [get_bd_cells processing_system7_0]
+endgroup
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+# DMA without scatter-gather, 64-bit memory-mapped data paths in both directions.
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list \
+  CONFIG.c_include_sg {0} \
+  CONFIG.c_m_axi_mm2s_data_width {64} \
+  CONFIG.c_m_axi_s2mm_data_width {64} \
+  CONFIG.c_mm2s_burst_size {32} \
+  CONFIG.c_sg_length_width {26} \
+] [get_bd_cells axi_dma_0]
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+endgroup
+# MM2S feeds the accelerator input stream (in_r); its output stream (out_r) goes back through S2MM.
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]
+
+#todo: make clock a variable
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP2} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP2]
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${project_name}_axi_0/ap_clk]
+endgroup
+
+validate_bd_design
+# Double quotes (not braces) so that ${project_name} is substituted inside the path.
+open_bd_design "./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd"
+
+make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+#todo: make number of jobs a variable
+launch_runs impl_1 -to_step write_bitstream -jobs 10
+wait_on_run -timeout 480 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1d70e55406
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
@@ -0,0 +1,83 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import PL, Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    """PYNQ overlay that runs inference by streaming data through the `axi_dma_0` DMA engine."""
+
+    def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None):
+        # Forward the caller's options; passing the literal defaults here would silently ignore them.
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+
+    def _print_dt(self, timea, timeb, N):
+        """Print and return the elapsed seconds between `timea` and `timeb` and the rate for `N` samples."""
+        dts = (timeb - timea).total_seconds()
+        rate = N / dts
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        return dts, rate
+
+    @staticmethod
+    def reset_PL():
+        """Call `PL.reset()`; works on the class and on instances alike."""
+        PL.reset()
+
+    def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and
+                    for sizing the output vector shape.
+        - dtype : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot
+                  be any integer value: it must be one of {..., 8, 16, 32, ...}; check the `numpy` doc).
+                  In this case the encoding/decoding has to be computed by the PS. For example for
+                  'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode) # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - debug : if truthy, print a progress message after each DMA step.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inference/s`.
+        - encode/decode: function pointers. See `dtype` section for more information.
+        - return: the output array, allocated with shape `y_shape` and the given `dtype` (then passed through
+                  `decode`, if any). With `profile=True` the tuple `(output, seconds, inferences/s)` instead.
+        """
+        if encode is not None:
+            X = encode(X)
+        with allocate(shape=X.shape, dtype=dtype) as input_buffer, allocate(shape=y_shape, dtype=dtype) as output_buffer:
+            input_buffer[:] = X
+            if profile:
+                timea = datetime.now()
+
+            self.axi_dma_0.sendchannel.transfer(input_buffer)
+            self.axi_dma_0.recvchannel.transfer(output_buffer)
+            if debug:
+                print("Transfer OK")
+            self.axi_dma_0.sendchannel.wait()
+            if debug:
+                print("Send OK")
+            self.axi_dma_0.recvchannel.wait()
+
+            if profile:
+                timeb = datetime.now()
+            if debug:
+                print("Receive OK")
+            # Copy the result out before leaving the `with` block that manages the buffers.
+            result = output_buffer.copy()
+
+        if decode is not None:
+            result = decode(result)
+        if profile:
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return result, dts, rate
+        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..689186eb5f
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,65 @@
+#@todo: try to remove startgroup and endgroup and see if it still works
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+# Create the Vivado project for part xczu9eg-ffvb1156-2-e and select the ZCU102 board.
+create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xczu9eg-ffvb1156-2-e -force
+
+set_property board_part xilinx.com:zcu102:part0:3.3 [current_project]
+set_property  ip_repo_paths  ${project_name}_prj [current_project]
+update_ip_catalog
+# NOTE(review): ip_repo_paths is assigned above and again right after create_bd_design; presumably only one is needed.
+create_bd_design "design_1"
+set_property  ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project]
+update_ip_catalog
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_1
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells zynq_ultra_ps_e_1]
+# Enable the S_AXI_GP2 and S_AXI_GP4 slave ports of the PS with a 64-bit data width.
+set_property -dict [list \
+  CONFIG.PSU__SAXIGP2__DATA_WIDTH {64} \
+  CONFIG.PSU__SAXIGP4__DATA_WIDTH {64} \
+  CONFIG.PSU__USE__S_AXI_GP2 {1} \
+  CONFIG.PSU__USE__S_AXI_GP4 {1} \
+] [get_bd_cells zynq_ultra_ps_e_1]
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+# DMA without scatter-gather, 64-bit memory-mapped data paths in both directions.
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list \
+  CONFIG.c_include_sg {0} \
+  CONFIG.c_m_axi_mm2s_data_width {64} \
+  CONFIG.c_m_axi_s2mm_data_width {64} \
+  CONFIG.c_mm2s_burst_size {32} \
+  CONFIG.c_sg_length_width {26} \
+] [get_bd_cells axi_dma_0]
+# Connect the DMA control port (S_AXI_LITE) and its two memory-mapped masters to the PS.
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_1/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_1/S_AXI_HP0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP0_FPD]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_1/S_AXI_HP2_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP2_FPD]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_1/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/M_AXI_HPM1_FPD]
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+endgroup
+# MM2S feeds the accelerator input stream (in_r); its output stream (out_r) goes back through S2MM.
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]
+
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${project_name}_axi_0/ap_clk]
+
+make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+# Run implementation up to write_bitstream, then write the hierarchical utilization report.
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 480 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h
new file mode 100755
index 0000000000..e01c8a8cd1
--- /dev/null
+++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h
@@ -0,0 +1,441 @@
+// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689
+/*****************************************************************************
+ *
+ *     Author: Xilinx, Inc.
+ *
+ *     This text contains proprietary, confidential information of
+ *     Xilinx, Inc. , is distributed by under license from Xilinx,
+ *     Inc., and may be used, copied and/or disclosed only pursuant to
+ *     the terms of a valid license agreement with Xilinx, Inc.
+ *
+ *     XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS"
+ *     AS A COURTESY TO YOU, SOLELY FOR USE IN DEVELOPING PROGRAMS AND
+ *     SOLUTIONS FOR XILINX DEVICES.  BY PROVIDING THIS DESIGN, CODE,
+ *     OR INFORMATION AS ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE,
+ *     APPLICATION OR STANDARD, XILINX IS MAKING NO REPRESENTATION
+ *     THAT THIS IMPLEMENTATION IS FREE FROM ANY CLAIMS OF INFRINGEMENT,
+ *     AND YOU ARE RESPONSIBLE FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE
+ *     FOR YOUR IMPLEMENTATION.  XILINX EXPRESSLY DISCLAIMS ANY
+ *     WARRANTY WHATSOEVER WITH RESPECT TO THE ADEQUACY OF THE
+ *     IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OR
+ *     REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE FROM CLAIMS OF
+ *     INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE.
+ *
+ *     Xilinx products are not intended for use in life support appliances,
+ *     devices, or systems. Use in such applications is expressly prohibited.
+ *
+#-  (c) Copyright 2011-2022 Xilinx, Inc. All rights reserved.
+#-
+#-  This file contains confidential and proprietary information
+#-  of Xilinx, Inc. and is protected under U.S. and
+#-  international copyright and other intellectual property
+#-  laws.
+#-
+#-  DISCLAIMER
+#-  This disclaimer is not a license and does not grant any
+#-  rights to the materials distributed herewith. Except as
+#-  otherwise provided in a valid license issued to you by
+#-  Xilinx, and to the maximum extent permitted by applicable
+#-  law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+#-  WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+#-  AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+#-  BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+#-  INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+#-  (2) Xilinx shall not be liable (whether in contract or tort,
+#-  including negligence, or under any other theory of
+#-  liability) for any loss or damage of any kind or nature
+#-  related to, arising under or in connection with these
+#-  materials, including for any direct, or any indirect,
+#-  special, incidental, or consequential loss or damage
+#-  (including loss of data, profits, goodwill, or any type of
+#-  loss or damage suffered as a result of any action brought
+#-  by a third party) even if such damage or loss was
+#-  reasonably foreseeable or Xilinx had been advised of the
+#-  possibility of the same.
+#-
+#-  CRITICAL APPLICATIONS
+#-  Xilinx products are not designed or intended to be fail-
+#-  safe, or for use in any application requiring fail-safe
+#-  performance, such as life-support or safety devices or
+#-  systems, Class III medical devices, nuclear facilities,
+#-  applications related to the deployment of airbags, or any
+#-  other applications that could lead to death, personal
+#-  injury, or severe property or environmental damage
+#-  (individually and collectively, "Critical
+#-  Applications"). Customer assumes the sole risk and
+#-  liability of any use of Xilinx products in Critical
+#-  Applications, subject only to applicable laws and
+#-  regulations governing limitations on product liability.
+#-
+#-  THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+#-  PART OF THIS FILE AT ALL TIMES. 
+#- ************************************************************************
+
+ *
+ *****************************************************************************/
+
+/*
+ * This file contains the definition of the data types for AXI streaming. 
+ * ap_axi_s is a signed interpretation of the AXI stream
+ * ap_axi_u is an unsigned interpretation of the AXI stream
+ */
+
+#ifndef __AP__AXI_SDATA__
+#define __AP__AXI_SDATA__
+
+#include <climits>
+#include "ap_int.h"
+//#include "ap_fixed.h"
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+struct ap_fixed;
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+struct ap_ufixed;
+
+namespace hls {
+
+template <typename T> constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT;
+
+template <std::size_t W> constexpr std::size_t bitwidth<ap_int<W>> = W;
+template <std::size_t W> constexpr std::size_t bitwidth<ap_uint<W>> = W;
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+constexpr std::size_t bitwidth<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>> = _AP_W;
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+constexpr std::size_t bitwidth<ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>> = _AP_W;
+
+template <typename T>
+constexpr std::size_t bytewidth = (bitwidth<T> + CHAR_BIT - 1) / CHAR_BIT;
+
+template <typename T, std::size_t WUser, std::size_t WId, std::size_t WDest> struct axis {
+  static constexpr std::size_t NewWUser = (WUser == 0) ? 1 : WUser;
+  static constexpr std::size_t NewWId = (WId == 0) ? 1 : WId;
+  static constexpr std::size_t NewWDest = (WDest == 0) ? 1 : WDest;
+  T data;
+  ap_uint<bytewidth<T>> keep;
+  ap_uint<bytewidth<T>> strb;
+  ap_uint<NewWUser> user;
+  ap_uint<1> last;
+  ap_uint<NewWId> id;
+  ap_uint<NewWDest> dest;
+
+  ap_uint<NewWUser> *get_user_ptr() { 
+#pragma HLS inline
+    return (WUser == 0) ? nullptr : &user;
+  }
+  ap_uint<NewWId> *get_id_ptr() {
+#pragma HLS inline
+    return (WId == 0) ? nullptr : &id;
+  }
+  ap_uint<NewWDest> *get_dest_ptr() {
+#pragma HLS inline
+    return (WDest == 0) ? nullptr : &dest;
+  }
+};
+
+} // namespace hls
+
+template <std::size_t WData, std::size_t WUser, std::size_t WId, std::size_t WDest>
+using ap_axis = hls::axis<ap_int<WData>, WUser, WId, WDest>;
+
+template <std::size_t WData, std::size_t WUser, std::size_t WId, std::size_t WDest>
+using ap_axiu = hls::axis<ap_uint<WData>, WUser, WId, WDest>;
+
+// Isolate out qdma_axis from hls::axis for special APIs.
+template <std::size_t WData, std::size_t WUser, std::size_t WId, std::size_t WDest>
+struct qdma_axis;
+
+template <std::size_t WData> struct qdma_axis<WData, 0, 0, 0> {
+  //  private:
+  static constexpr std::size_t kBytes = (WData + 7) / 8;
+
+  ap_uint<WData> data;
+  ap_uint<kBytes> keep;
+  ap_uint<1> strb;
+  ap_uint<1> user;
+  ap_uint<1> last;
+  ap_uint<1> id;
+  ap_uint<1> dest;
+
+  ap_uint<1> *get_strb_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+  ap_uint<1> *get_user_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+  ap_uint<1> *get_id_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+  ap_uint<1> *get_dest_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+
+  //  public:
+  ap_uint<WData> get_data() const {
+#pragma HLS inline
+    return data;
+  }
+  ap_uint<kBytes> get_keep() const {
+#pragma HLS inline
+    return keep;
+  }
+  ap_uint<1> get_last() const {
+#pragma HLS inline
+    return last;
+  }
+
+  void set_data(const ap_uint<WData> &d) {
+#pragma HLS inline
+    data = d;
+  }
+  void set_keep(const ap_uint<kBytes> &k) {
+#pragma HLS inline
+    keep = k;
+  }
+  void set_last(const ap_uint<1> &l) {
+#pragma HLS inline
+    last = l;
+  }
+  void keep_all() {
+#pragma HLS inline
+    ap_uint<kBytes> k = 0;
+    keep = ~k;
+  }
+
+  qdma_axis() {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(ap_uint<WData> d) : data(d) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(ap_uint<WData> d, ap_uint<kBytes> k) : data(d), keep(k) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(ap_uint<WData> d, ap_uint<kBytes> k, ap_uint<1> l)
+      : data(d), keep(k), last(l) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(const qdma_axis<WData, 0, 0, 0> &d)
+      : data(d.data), keep(d.keep), last(d.last) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis &operator=(const qdma_axis<WData, 0, 0, 0> &d) {
+#pragma HLS inline
+    data = d.data;
+    keep = d.keep;
+    last = d.last;
+    return *this;
+  }
+};
+
+#ifdef AESL_SYN 
+#if ((__clang_major__ != 3) || (__clang_minor__ != 1))
+#include "hls_stream.h"
+namespace hls {
+
+template <typename T, std::size_t WUser, std::size_t WId, std::size_t WDest>
+class stream<axis<T, WUser, WId, WDest>> final {
+  typedef axis<T, WUser, WId, WDest> __STREAM_T__;
+
+public:
+  /// Constructors
+  INLINE stream() {}
+
+  INLINE stream(const char *name) { (void)name; }
+
+  /// Make copy constructor and assignment operator private
+private:
+  INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {}
+
+public:
+  /// Overload >> and << operators to implement read() and write()
+  INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); }
+
+  INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); }
+
+  /// empty & full
+  bool empty() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_valid(&V.data, &V.keep, &V.strb, V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  bool full() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_ready(&V.data, &V.keep, &V.strb, V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  /// Blocking read
+  void read(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                    V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                    &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                    tmp.get_dest_ptr());
+    dout = tmp;
+  }
+
+  __STREAM_T__ read() {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                    V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                    &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                    tmp.get_dest_ptr());
+    return tmp;
+  }
+
+  /// Blocking write
+  void write(const __STREAM_T__ &din) {
+#pragma HLS inline
+    __STREAM_T__ tmp = din;
+    __fpga_axis_push(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                     V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                     &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                     tmp.get_dest_ptr());
+  }
+
+  /// Non-Blocking read
+  bool read_nb(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                           V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+                           &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+                           &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) {
+      dout = tmp;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /// Non-Blocking write
+  bool write_nb(const __STREAM_T__ &in) {
+#pragma HLS inline
+    __STREAM_T__ tmp = in;
+    bool full_n = __fpga_axis_nb_push(
+        &V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, V.get_id_ptr(),
+        V.get_dest_ptr(), &tmp.data, &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+        &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+    return full_n;
+  }
+
+private:
+  __STREAM_T__ V NO_CTOR;
+};
+
+// specialization for qdma
+template <std::size_t WData>
+class stream<qdma_axis<WData, 0, 0, 0>> {
+  typedef qdma_axis<WData, 0, 0, 0> __STREAM_T__;
+
+public:
+  /// Constructors
+  INLINE stream() {}
+
+  INLINE stream(const char *name) { (void)name; }
+
+  /// Make copy constructor and assignment operator private
+private:
+  INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {}
+
+public:
+  /// Overload >> and << operators to implement read() and write()
+  INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); }
+
+  INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); }
+
+  /// empty & full
+  bool empty() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_valid(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  bool full() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_ready(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  /// Blocking read
+  void read(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+                    &V.last, V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+                    &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+                    &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+    dout = tmp;
+  }
+
+  __STREAM_T__ read() {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+                    V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                    tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                    tmp.get_dest_ptr());
+    return tmp;
+  }
+
+  /// Blocking write
+  void write(const __STREAM_T__ &din) {
+#pragma HLS inline
+    __STREAM_T__ tmp = din;
+    __fpga_axis_push(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+                     V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                     tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                     tmp.get_dest_ptr());
+  }
+
+  /// Non-Blocking read
+  bool read_nb(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+
+    if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                           V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+                           &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+                           &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) {
+      dout = tmp;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /// Non-Blocking write
+  bool write_nb(const __STREAM_T__ &in) {
+#pragma HLS inline
+    __STREAM_T__ tmp = in;
+    bool full_n = __fpga_axis_nb_push(
+        &V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, V.get_id_ptr(),
+        V.get_dest_ptr(), &tmp.data, &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+        &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+    return full_n;
+  }
+
+private:
+  __STREAM_T__ V NO_CTOR;
+};
+
+} // namespace hls
+#endif
+#endif
+#endif
diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl
index 888c5f4c95..a1d0c6b774 100644
--- a/hls4ml/templates/vivado/build_prj.tcl
+++ b/hls4ml/templates/vivado/build_prj.tcl
@@ -210,7 +210,7 @@ if {$opt(cosim)} {
 
     set time_end [clock clicks -milliseconds]
     puts "INFO:"
-    if {[string equal "$backend" "vivadoaccelerator"]} {
+    if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisacceleratoripflow"]} {
         puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]]
     } else {
         puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]]
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
index b8c2a48d19..2a695d4e5a 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
@@ -11,6 +11,11 @@
 #include <stdlib.h>
 #include <vector>
 
+// This header cannot be included by Vivado HLS, so it is only pulled in when
+// "VITIS_ACCELERATOR_IP_FLOW" is defined (by the build_lib.sh of the `Vitis Accelerator` template files).
+#ifdef VITIS_ACCELERATOR_IP_FLOW
+#include "ap_axi_sdata.h"
+#endif
 namespace nnet {
 
 #ifndef __SYNTHESIS__
@@ -161,6 +166,26 @@ template <class srcType, class dstType, size_t SIZE> void convert_data(hls::stre
     }
 }
 
+#ifdef VITIS_ACCELERATOR_IP_FLOW
+// Array -> AXI stream: write SIZE beats, one per element of src (TODO: template the hls::axis<float, 0, 0, 0> type).
+template <class srcType, typename dstType, size_t SIZE>
+void convert_data(srcType *src, hls::stream<hls::axis<float, 0, 0, 0>> &dst) {
+    for (size_t i = 0; i < SIZE; i++) {
+        hls::axis<float, 0, 0, 0> ctype; // only `data` is assigned below; no other field is set here
+        ctype.data = dstType(src[i]);
+        dst.write(ctype);
+    }
+}
+// AXI stream -> array: read SIZE beats from src and store each beat's `data`, converted to dstType, in dst.
+template <typename srcType, class dstType, size_t SIZE>
+void convert_data(hls::stream<hls::axis<float, 0, 0, 0>> &src, dstType *dst) {
+    for (size_t i = 0; i < SIZE; i++) {
+        hls::axis<float, 0, 0, 0> ctype = src.read();
+        dst[i] = dstType(ctype.data);
+    }
+}
+#endif
+
 extern bool trace_enabled;
 extern std::map<std::string, void *> *trace_outputs;
 extern size_t trace_type_size;
@@ -247,8 +272,6 @@ template <class data_T> void save_layer_output(hls::stream<data_T> &data, const
     }
 }
 
-#endif
-
 template <class src_T, class dst_T, size_t OFFSET, size_t SIZE> void copy_data(std::vector<src_T> src, dst_T dst[SIZE]) {
     typename std::vector<src_T>::const_iterator in_begin = src.cbegin() + OFFSET;
     typename std::vector<src_T>::const_iterator in_end = in_begin + SIZE;
@@ -272,14 +295,27 @@ void copy_data(std::vector<src_T> src, hls::stream<dst_T> &dst) {
 }
 
 template <class src_T, class dst_T, size_t OFFSET, size_t SIZE> void copy_data_axi(std::vector<src_T> src, dst_T dst[SIZE]) {
-    for (auto i = 0; i < SIZE; i++)
+    for (auto i = 0; i < SIZE; i++) {
+        dst[i].data = src[OFFSET + i]; // start at OFFSET, as copy_data<..., OFFSET, SIZE> does
         if (i == SIZE - 1) {
-            dst[i].data = src[i];
             dst[i].last = 1;
         } else {
-            dst[i].data = src[i];
             dst[i].last = 0;
         }
+    }
+}
+
+// Stream overload: write the first SIZE elements of src to dst, one beat each;
+// only the final beat carries last = 1, every earlier beat carries last = 0.
+template <class src_T, class dst_T, size_t SIZE> void copy_data_axi(std::vector<src_T> src, hls::stream<dst_T> &dst) {
+    for (auto idx = 0; idx < SIZE; idx++) {
+        const bool final_beat = (idx == SIZE - 1);
+        dst_T beat;
+        beat.data = src[idx];
+        beat.last = final_beat ? 1 : 0;
+        // push the beat after both fields have been set
+        dst.write(beat);
+    }
 }
 
 template <class res_T, size_t SIZE> void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) {
@@ -289,21 +325,55 @@ template <class res_T, size_t SIZE> void print_result(res_T result[SIZE], std::o
     out << std::endl;
 }
 
-template <class res_T, size_t SIZE> void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
+template <class res_T, size_t SIZE, typename std::enable_if<std::is_array<res_T>::value, int>::type = 0>
+void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
     for (int i = 0; i < SIZE / res_T::size; i++) {
         res_T res_pack = result.read();
         for (int j = 0; j < res_T::size; j++) {
             out << res_pack[j] << " ";
         }
-        if (keep)
+        if (keep) { // write the pack back so the stream still holds it after printing
+            result.write(res_pack);
+        }
+    }
+    out << std::endl;
+}
+
+// compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel; prints `data` of each of SIZE beats
+template <class res_T, size_t SIZE, typename std::enable_if<!std::is_array<res_T>::value, int>::type = 0>
+void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
+    for (int i = 0; i < SIZE; i++) {
+        res_T res_pack = result.read();
+        // NOTE(review): selected via !std::is_array<res_T>, i.e. for any type that is not a built-in array; confirm.
+        out << res_pack.data << " ";
+        // With keep set, write the beat back so the stream still holds it after printing.
+        if (keep) {
             result.write(res_pack);
+        }
+    }
+    out << std::endl;
+}
+
+// compatible with Vitis Accelerator for res_T = hls::axis<underlying_data_T, ...> and io_stream
+template <class underlying_res_T, class res_T, size_t SIZE>
+void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
+    for (int i = 0; i < SIZE / underlying_res_T::size; i++) { // one outer pass per underlying_res_T::size beats
+        res_T res_pack;
+        for (int j = 0; j < underlying_res_T::size; j++) { // one stream beat is read (and printed) per j
+            res_pack = result.read();
+            out << res_pack.data << " ";
+            if (keep) { // write the beat back so the stream still holds it after printing
+                result.write(res_pack);
+            }
+        }
     }
     out << std::endl;
 }
 
 template <class data_T, size_t SIZE> void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); }
 
-template <class data_T, size_t SIZE> void fill_zero(hls::stream<data_T> &data) {
+template <class data_T, size_t SIZE, typename std::enable_if<std::is_array<data_T>::value, int>::type = 0>
+void fill_zero(hls::stream<data_T> &data) {
     for (int i = 0; i < SIZE / data_T::size; i++) {
         data_T data_pack;
         for (int j = 0; j < data_T::size; j++) {
@@ -313,6 +383,36 @@ template <class data_T, size_t SIZE> void fill_zero(hls::stream<data_T> &data) {
     }
 }
 
+template <class data_T, size_t SIZE, typename std::enable_if<!std::is_array<data_T>::value, int>::type = 0>
+void fill_zero(hls::stream<data_T> &data) {
+    for (int i = 0; i < SIZE; i++) {
+        data_T data_pack;
+        data_pack.data = 0.;
+        if (i == SIZE - 1) {
+            data_pack.last = 1;
+        } else {
+            data_pack.last = 0;
+        }
+        data.write(data_pack);
+    }
+}
+
+// compatible with Vitis Accelerator for data_T = hls::axis<underlying_data_T, ...>; sets `last` on the final packet
+template <class underlying_data_T, class data_T, size_t SIZE> void fill_zero(hls::stream<data_T> &data) {
+    for (int i = 0; i < SIZE / underlying_data_T::size; i++) {
+        data_T data_pack;
+        for (int j = 0; j < underlying_data_T::size; j++) {
+            data_pack.data = 0.;
+            if ((i == (SIZE / underlying_data_T::size - 1)) && (j == (underlying_data_T::size - 1))) {
+                data_pack.last = 1;
+            } else {
+                data_pack.last = 0;
+            }
+            data.write(data_pack);
+        }
+    }
+}
+
 template <class dataType, unsigned int nrows> int read_file_1D(const char *filename, dataType data[nrows]) {
     FILE *fp;
     fp = fopen(filename, "r");
@@ -370,6 +470,7 @@ template <class data_T, int N_IN> void hls_stream_debug(hls::stream<data_T> &dat
         res << datareg;
     }
 }
+#endif
 
 constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); }
 
diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py
index 8de19fe1d2..0cc7d2b4b0 100644
--- a/hls4ml/writer/__init__.py
+++ b/hls4ml/writer/__init__.py
@@ -2,6 +2,7 @@
 from hls4ml.writer.oneapi_writer import OneAPIWriter
 from hls4ml.writer.quartus_writer import QuartusWriter
 from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter
+from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter
 from hls4ml.writer.vitis_writer import VitisWriter
 from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
 from hls4ml.writer.vivado_writer import VivadoWriter
@@ -10,6 +11,7 @@
 register_writer('Vivado', VivadoWriter)
 register_writer('VivadoAccelerator', VivadoAcceleratorWriter)
 register_writer('Vitis', VitisWriter)
+register_writer('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowWriter)
 register_writer('Quartus', QuartusWriter)
 register_writer('oneAPI', OneAPIWriter)
 register_writer('Catapult', CatapultWriter)
diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py
new file mode 100644
index 0000000000..9805c5b33f
--- /dev/null
+++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py
@@ -0,0 +1,393 @@
+import os
+from shutil import copyfile
+
+from hls4ml.writer.vitis_writer import VitisWriter
+
+
+class VitisAcceleratorIPFlowWriter(VitisWriter):
+    def __init__(self):
+        super().__init__()
+        self.vitis_accelerator_ip_flow_config = None
+
+    def write_axi_wrapper(self, model):
+        '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces
+        Args:
+            model : The ModelGraph to write the wrapper for
+        '''
+        inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types()
+        indent = '    '
+
+        #######################
+        # myproject_axi.h
+        #######################
+
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.h'))
+        fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w')
+
+        for line in f.readlines():
+            if 'MYPROJECT' in line:
+                newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper()))
+            elif '// hls-fpga-machine-learning insert include' in line:
+                newline = f'#include "{model.config.get_project_name()}.h"\n'
+                newline += '#include "ap_axi_sdata.h"\n'
+            elif 'myproject' in line:
+                newline = line.replace('myproject', model.config.get_project_name())
+            elif '// hls-fpga-machine-learning insert definitions' in line:
+                newline = ''
+                newline += f'static const unsigned N_IN = {inp.size()};\n'
+                newline += f'static const unsigned N_OUT = {out.size()};\n'
+                if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                    newline += 'typedef hls::axis<float, 0, 0, 0> dma_data_packet;\n'
+                    # might need to make "float" a variable according to the
+                    # configuration set by the user and the DMA available data widths
+                else:  # TODO: handle this case
+                    newline += f'typedef {inp_axi_t} input_axi_t;\n'
+                    newline += f'typedef {out_axi_t} output_axi_t;\n'
+            else:
+                newline = line
+            fout.write(newline)
+        f.close()
+        fout.close()
+
+        #######################
+        # myproject_axi.cpp
+        #######################
+
+        f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.cpp'))
+        fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w')
+
+        io_type = model.config.get_config_value("IOType")
+
+        for line in f.readlines():
+            if 'myproject' in line:
+                newline = line.replace('myproject', model.config.get_project_name())
+            elif '// hls-fpga-machine-learning insert include' in line:
+                newline = f'#include "{model.config.get_project_name()}_axi.h"\n'
+            elif '// hls-fpga-machine-learning insert local vars' in line:
+                newline = ''
+                if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                    newline += indent + 'bool is_last = false;\n'
+                if io_type == 'io_parallel':  # TODO: handle io_parallel
+                    newline += indent + inp.type.name + ' in_local[N_IN];\n'
+                    newline += indent + out.type.name + ' out_local[N_OUT];\n'
+                    newline += indent + 'dma_data_packet tmp;\n'
+                elif io_type == 'io_stream':
+                    newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n'
+                    newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n'
+                    newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format(
+                        model.get_input_variables()[0].pragma[1]
+                    )
+                    newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format(
+                        model.get_output_variables()[0].pragma[1]
+                    )
+            elif '// hls-fpga-machine-learning insert call' in line:
+                newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n'
+            elif '// hls-fpga-machine-learning insert interface' in line:
+                if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite':  # TODO: handle axi_lite
+                    newline = ''
+                    newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
+                    newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n'
+                    newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n'
+                elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master':  # TODO: handle axi_master
+                    newline = ''
+                    newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n'
+                    newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format(
+                        model.get_input_variables()[0].pragma[1]
+                    )
+                    newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format(
+                        model.get_output_variables()[0].pragma[1]
+                    )
+                elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                    newline = ''
+                    newline += indent + '#pragma HLS INTERFACE axis port=in\n'
+                    newline += indent + '#pragma HLS INTERFACE axis port=out\n'
+                    newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
+                    if model.config.get_config_value("IOType") == 'io_stream':
+                        newline += indent + '#pragma HLS DATAFLOW\n'
+            elif '// hls-fpga-machine-learning insert enqueue' in line:
+                io_type = model.config.get_config_value("IOType")
+                if io_type == 'io_parallel':  # TODO: handle io_parallel
+                    newline = ''
+                    newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n'
+                    if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                        newline += indent + indent + '#pragma HLS PIPELINE\n'
+                        newline += indent + indent + 'tmp = in.read(); // Read input with cast\n'
+                        newline += indent + indent + 'in_local[i] = tmp.data;\n'
+                        newline += indent + indent + 'is_last = tmp.last;\n'
+                    else:
+                        newline += indent + indent + '#pragma HLS UNROLL\n'
+                        newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n'
+                    newline += indent + '}\n'
+                    newline += indent + 'tmp.last = 0;\n'
+                elif io_type == 'io_stream':
+                    newline = ''
+                    newline += indent + 'dma_data_packet tmp;\n'
+
+                    newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n'
+                    # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed
+                    newline += indent + indent + '{input_t} ctype;\n'
+                    # newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n'
+                    # newline += indent + indent + 'pragma HLS aggregate variable=ctype compact=auto' # TODO: check if needed
+                    newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n'
+                    # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed
+                    if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                        newline += indent + indent + indent + 'in.read(tmp);\n'
+                        newline += indent + indent + indent + 'ctype[j] = tmp.data;\n'
+                        newline += indent + indent + indent + 'is_last = tmp.last;\n'
+                    else:  # TODO: handle this case
+                        newline += (
+                            indent
+                            + indent
+                            + indent
+                            + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n'
+                        )
+                    newline += indent + indent + '}}\n'
+                    newline += indent + indent + 'in_local.write(ctype);\n'
+                    newline += indent + '}}\n'
+                    newline += indent + 'tmp.last = 0;\n'
+                    newline = newline.format(input_t=inp.type.name)
+            elif '// hls-fpga-machine-learning insert dequeue' in line:
+                io_type = model.config.get_config_value("IOType")
+                if io_type == 'io_parallel':  # TODO: handle this case
+                    newline = ''
+                    newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n'
+                    if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                        newline += indent + indent + '#pragma HLS PIPELINE\n'
+                        newline += indent + indent + 'tmp.data = out_local[i];\n'
+                        newline += indent + indent + 'tmp.last = (is_last && (i == N_OUT - 1))? true : false;\n'
+                        newline += indent + indent + 'out.write(tmp);\n'
+                    else:
+                        newline += indent + indent + '#pragma HLS UNROLL\n'
+                        newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n'
+                    newline += indent + '}\n'
+                elif io_type == 'io_stream':
+                    newline = ''
+                    newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n'
+                    # newline += indent + indent + '#pragma HLS PIPELINE\n'
+                    newline += indent + indent + '{result_t} ctype = out_local.read();\n'
+                    newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n'
+                    # newline += indent + indent + indent + '#pragma HLS UNROLL\n'
+                    if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                        newline += indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n'
+
+                        newline += indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n'
+
+                        newline += indent + indent + indent + 'out.write(tmp);\n'
+                    else:
+                        newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n'
+                    newline += indent + indent + '}}\n'
+                    newline += indent + '}}\n'
+                    newline = newline.format(result_t=out.type.name)
+            else:
+                newline = line
+            fout.write(newline)
+        f.close()
+        fout.close()
+
+    def modify_build_script(self, model):
+        '''
+        Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        oldfile = f'{model.config.get_output_dir()}/build_prj.tcl'
+        newfile = f'{model.config.get_output_dir()}/build_prj_axi.tcl'
+        f = open(oldfile)
+        fout = open(newfile, 'w')
+
+        for line in f.readlines():
+            if 'set_top' in line:
+                newline = line[:-1] + '_axi\n'  # remove the newline from the line end and append _axi for the new top
+                newline += f'add_files firmware/{model.config.get_project_name()}_axi.cpp -cflags "-std=c++0x"\n'
+            elif f'{model.config.get_project_name()}_cosim' in line:
+                newline = line.replace(
+                    f'{model.config.get_project_name()}_cosim',
+                    f'{model.config.get_project_name()}_axi_cosim',
+                )
+            elif '${project_name}.tcl' in line:
+                newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl')
+            else:
+                newline = line
+            fout.write(newline)
+
+        f.close()
+        fout.close()
+        os.rename(newfile, oldfile)
+
+        ###################
+        # build_lib.sh
+        ###################
+
+        f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/build_lib.sh'))
+        fout = open(f'{model.config.get_output_dir()}/build_lib.sh', 'w')
+
+        for line in f.readlines():
+            line = line.replace('myproject', model.config.get_project_name())
+            line = line.replace('mystamp', model.config.get_config_value('Stamp'))
+
+            fout.write(line)
+        f.close()
+        fout.close()
+
+    def write_wrapper_test(self, model):
+        ###################
+        # write myproject_test_wrapper.cpp
+        ###################
+        oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test.cpp'
+        newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_test_wrapper.cpp'
+
+        inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types()
+
+        f = open(oldfile)
+        fout = open(newfile, 'w')
+
+        inp = model.get_input_variables()[0]
+        out = model.get_output_variables()[0]
+        io_type = model.config.get_config_value("IOType")
+
+        for line in f.readlines():
+            if f'{model.config.get_project_name()}.h' in line:
+                newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h')
+            elif inp.definition_cpp() in line:
+                newline = line.replace(
+                    inp.definition_cpp(), 'hls::stream< dma_data_packet > inputs'
+                )  # TODO instead of replacing strings, how about we use proper variables and their definition?
+            elif out.definition_cpp() in line:
+                newline = line.replace(out.definition_cpp(), 'hls::stream< dma_data_packet > outputs')
+            elif 'unsigned short' in line:
+                newline = ''
+            elif f'{model.config.get_project_name()}(' in line:
+                indent_amount = line.split(model.config.get_project_name())[0]
+                newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n'
+            elif inp.size_cpp() in line or inp.name in line or inp.type.name in line:
+                newline = (
+                    line.replace(inp.size_cpp(), 'N_IN')
+                    .replace(inp.name, 'inputs')
+                    .replace(inp.type.name, 'dma_data_packet')
+                )
+            elif out.size_cpp() in line or out.name in line or out.type.name in line:
+                newline = (
+                    line.replace(out.size_cpp(), 'N_OUT')
+                    .replace(out.name, 'outputs')
+                    .replace(out.type.name, 'dma_data_packet')
+                )
+            else:
+                newline = line
+            if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+                if 'copy_data' in line:
+                    newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "")
+
+                if io_type == 'io_stream':
+                    if 'nnet::fill_zero' in line:
+                        newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ")
+                        # indent = line.split('n')[0]
+                        # newline = indent + indent + 'inputs[N_IN-1].last = 1;\n'
+                    if 'print_result' in line:
+                        newline = newline.replace("print_result<", f"print_result<{out.type.name}, ")
+            fout.write(newline)
+
+        f.close()
+        fout.close()
+        os.rename(newfile, oldfile)
+
+        ###################
+        # write myproject_bridge_wrapper.cpp
+        ###################
+        oldfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp'
+        newfile = f'{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge_wrapper.cpp'
+
+        f = open(oldfile)
+        fout = open(newfile, 'w')
+
+        inp = model.get_input_variables()[0]
+        out = model.get_output_variables()[0]
+
+        for line in f.readlines():
+            if f'{model.config.get_project_name()}.h' in line:
+                newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h')
+            elif inp.definition_cpp(name_suffix='_ap') in line:
+                newline = line.replace(
+                    inp.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {inp.name}_ap'
+                )
+            elif out.definition_cpp(name_suffix='_ap') in line:
+                newline = line.replace(
+                    out.definition_cpp(name_suffix='_ap'), f'hls::stream< dma_data_packet > {out.name}_ap'
+                )
+            elif f'{model.config.get_project_name()}(' in line:
+                indent_amount = line.split(model.config.get_project_name())[0]
+                newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(
+                    model.config.get_project_name(), inp.name, out.name
+                )
+            elif inp.size_cpp() in line or inp.name in line or inp.type.name in line:
+                newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, inp_axi_t)
+            elif out.size_cpp() in line or out.name in line or out.type.name in line:
+                newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, out_axi_t)
+            else:
+                newline = line
+            fout.write(newline)
+
+        f.close()
+        fout.close()
+        os.rename(newfile, oldfile)
+
+    def write_board_script(self, model):
+        '''
+        Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAcceleratorIPFlow
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        copyfile(
+            os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_tcl_file_path()),
+            f'{model.config.get_output_dir()}/design.tcl',
+        )
+
+        ###################
+        # project.tcl
+        ###################
+        f = open(f'{model.config.get_output_dir()}/project.tcl', 'w')
+        f.write('variable project_name\n')
+        f.write(f'set project_name "{model.config.get_project_name()}"\n')
+        f.write('variable backend\n')
+        f.write('set backend "vitisacceleratoripflow"\n')
+        f.write('variable part\n')
+        f.write(f'set part "{self.vitis_accelerator_ip_flow_config.get_part()}"\n')
+        f.write('variable clock_period\n')
+        f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')))
+        f.write('variable clock_uncertainty\n')
+        f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%')))
+        f.write('variable version\n')
+        f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0')))
+        if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream':
+            in_bit, out_bit = self.vitis_accelerator_ip_flow_config.get_io_bitwidth()
+            f.write(f'set bit_width_hls_output {in_bit}\n')
+            f.write(f'set bit_width_hls_input {out_bit}\n')
+        f.close()
+        return
+
+    def write_driver(self, model):
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        copyfile(
+            os.path.join(filedir, self.vitis_accelerator_ip_flow_config.get_driver_path()),
+            ('{}/' + self.vitis_accelerator_ip_flow_config.get_driver_file()).format(model.config.get_output_dir()),
+        )
+
+    def write_new_tar(self, model):
+        # os.remove(model.config.get_output_dir() + '.tar.gz')
+        super().write_tar(model)
+
+    def write_hls(self, model):
+        """
+        Write the HLS project. Calls the VivadoBackend writer, and extra steps for VitisAcceleratorIPFlow/AXI interface
+        """
+        # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package
+        from hls4ml.backends import VitisAcceleratorIPFlowConfig
+
+        self.vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig(
+            model.config, model.get_input_variables(), model.get_output_variables()
+        )
+        super().write_hls(model)
+        self.write_board_script(model)
+        self.write_driver(model)
+        self.write_wrapper_test(model)
+        self.write_axi_wrapper(model)
+        self.modify_build_script(model)
+        self.write_new_tar(model)
diff --git a/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py
new file mode 100644
index 0000000000..00af95fce6
--- /dev/null
+++ b/test/pytest/test_backend/test_vitis_accelerator_ip_flow.py
@@ -0,0 +1,257 @@
+import json
+import os
+import re
+from pathlib import Path
+
+import numpy as np
+import pytest
+import qonnx.core.onnx_exec as oxe
+from qonnx.core.modelwrapper import ModelWrapper
+from tensorflow.keras.layers import SeparableConv2D
+from tensorflow.keras.models import Sequential
+
+import hls4ml
+
+test_root_path = Path(__file__).parent
+example_model_path = (test_root_path / '../../../example-models').resolve()
+
+backend_options = ['VitisAcceleratorIPFlow']
+
+
+def parse_cosim_report_and_search_for_bitstream(project_path):
+    """Parse the cosimulation report to check whether the cosimulation passed or failed and therefore a deadlock is
+    detected and check if the bitstream was generated without errors.
+    """
+    prj_dir = None
+    top_func_name = None
+
+    project_tcl_path = project_path + '/project.tcl'
+
+    with open(project_tcl_path) as f:
+        for line in f.readlines():
+            if 'set project_name' in line:
+                top_func_name = line.split('"')[-2]
+                prj_dir = top_func_name + '_prj'
+
+    cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt'
+    bitsteam_path = (
+        project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit"
+    )
+
+    cosim_report_exists = os.path.isfile(cosim_file_path)
+    bitstream_exists = os.path.isfile(bitsteam_path)
+
+    if cosim_report_exists and bitstream_exists:
+        return cosim_file_path, bitstream_exists
+    elif (not cosim_report_exists) and (not bitstream_exists):
+        raise FileNotFoundError("Co-simulation report and Bitstream not found.")
+    elif not cosim_report_exists:
+        raise FileNotFoundError("Co-simulation report not found.")
+    else:
+        raise FileNotFoundError("Bitstream not found.")
+
+
+def run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization):
+    """Execute bitstream generation on a dummy Keras model and the FIFO optimization sequence if
+    `run_fifo_depth_optimization` is set.
+    """
+
+    # create a keras model
+    input_shape = (16, 16, 3)
+    activation = 'relu'
+    kernel_size = (3, 3)
+    padding = 'same'
+
+    model = Sequential()
+    model.add(
+        SeparableConv2D(filters=4, kernel_size=kernel_size, padding=padding, activation=activation, input_shape=input_shape)
+    )
+    model.add(SeparableConv2D(filters=8, kernel_size=kernel_size, padding=padding, activation=activation))
+    model.compile(optimizer='adam', loss='mse')
+
+    X_input = np.random.rand(1, *input_shape)
+    keras_prediction = model.predict(X_input)
+
+    config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>')
+
+    # include the FIFO Depth optimizer do the flows if `run_fifo_depth_optimization` is set
+    if run_fifo_depth_optimization:
+        config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization']
+        hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure(
+            profiling_fifo_depth=profiling_fifo_depth
+        )
+
+    output_dir = str(
+        test_root_path / f'hls4mlprj_keras_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}'
+    )
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend, clock_period=10
+    )
+
+    hls_model.compile()
+    hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape)
+
+    np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01)
+
+    # build the hls4ml model and check if the bitstream was generated and the FIFOs were optimized if
+    # `run_fifo_depth_optimization` is set
+    build_and_check(hls_model, run_fifo_depth_optimization)
+
+
+def build_and_check(hls_model, run_fifo_depth_optimization):
+    """Execute the FIFO depth optimization sequence on an hls4ml model."""
+
+    # try to generate a bitstream. Use the optimized FIFO depths and execute cosimulation to check for deadlocks
+    # due to the new FIFO depths if `run_fifo_depth_optimization` is set
+    hls_model.build(
+        reset=False, synth=True, csim=False, export=True, cosim=True, bitfile=True, fifo_opt=run_fifo_depth_optimization
+    )
+
+    # checks if the fifo depths decreased/were optimized
+    fifo_depths_decreased = False
+    if run_fifo_depth_optimization:
+        fifo_depths = {}
+        with open(hls_model.config.get_output_dir() + "/fifo_depths.json") as fifo_depths_file:
+            fifo_depths = json.load(fifo_depths_file)
+
+        # omit checking for the input and output AXIS FIFOs as they are not always optimized
+        # as the last kernel e.g pointwise is faster than AXIS speed
+        fifo_depths_decreased = all(
+            fifo_depths['optimized'] < fifo_depths['initial']
+            for fifo_name, fifo_depths in fifo_depths.items()
+            if fifo_name not in {'in_local', 'out_local'}
+        )
+
+    # checks that the cosimulation ran succesfully without detecting deadlocks and if the bitstream was generated
+    cosim_report_path, bitstream_exists = parse_cosim_report_and_search_for_bitstream(hls_model.config.get_output_dir())
+
+    with open(cosim_report_path) as cosim_report_file:
+        cosim_succesful = any("Pass" in line for line in cosim_report_file)
+
+    assert (fifo_depths_decreased or (not run_fifo_depth_optimization)) and cosim_succesful and bitstream_exists
+
+
+def expect_exception(error, message, backend, profiling_fifo_depth, io_type):
+    with pytest.raises(error, match=re.escape(message)):
+        run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type)
+
+
+def get_branched_model():
+    """
+    Load branched model, already channels-last and cleaned.
+    """
+    dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx")
+    assert os.path.isfile(dl_file)
+    model = ModelWrapper(dl_file)
+    return model
+
+
+def run_bitstream_generation_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization):
+    """Convert a QONNX branched model with hls4ml, compare its predictions with the QONNX reference output
+    and build it. The FIFO depth optimization flow is enabled when `run_fifo_depth_optimization` is set.
+    """
+    fifo_flow = 'vitisacceleratoripflow:fifo_depth_optimization'
+
+    # Random input drawn uniformly between 0 and 1, then rounded to multiples of 2**-16.
+    input_shape = tuple(model.get_tensor_shape(model.graph.input[0].name))
+    X = np.random.uniform(low=0, high=1, size=np.prod(input_shape))
+    X = X.reshape(input_shape)
+    X = np.round(X * 2**16) * 2**-16
+    X = X.astype(np.float32)
+
+    # Reference result obtained by executing the ONNX graph directly.
+    feed = {model.graph.input[0].name: X}
+    qonnx_outputs = oxe.execute_onnx(model, feed)
+    y_qonnx = qonnx_outputs[model.graph.output[0].name]
+
+    config = hls4ml.utils.config.config_from_onnx_model(
+        model,
+        granularity='name',
+        backend=backend,
+        default_precision='ap_fixed<15,2,AP_RND_CONV>',
+    )
+
+    # add this line to remove the linear layer that quantizes the input of the NN
+    config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>'
+
+    if run_fifo_depth_optimization:
+        config['Flows'] = [fifo_flow]
+        fifo_optimizer = hls4ml.model.optimizer.get_optimizer(fifo_flow)
+        fifo_optimizer.configure(profiling_fifo_depth=profiling_fifo_depth)
+
+    project_name = f'hls4mlprj_branched_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}'
+    conversion_options = {
+        'output_dir': str(test_root_path / project_name),
+        'io_type': io_type,
+        'backend': backend,
+        'hls_config': config,
+        'part': 'xczu9eg-ffvb1156-2-e',
+        'board': 'zcu102',
+        'clock_period': 10,
+    }
+    hls_model = hls4ml.converters.convert_from_onnx_model(model, **conversion_options)
+    hls_model.compile()
+
+    # The hls4ml prediction has to match the QONNX reference exactly.
+    y_hls4ml = hls_model.predict(np.ascontiguousarray(X))
+    np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel())
+
+    build_and_check(hls_model, run_fifo_depth_optimization)
+
+
+@pytest.mark.skip(reason='Skipping synthesis tests for now')
+@pytest.mark.parametrize('backend', backend_options)
+@pytest.mark.parametrize('profiling_fifo_depth', [-2, 3.14, "a"])
+def test_value_error(backend, profiling_fifo_depth):
+    """Expect a ValueError when the FIFO depth optimizer receives an invalid `profiling_fifo_depth`."""
+    expect_exception(
+        ValueError,
+        "The FIFO depth for profiling (profiling_fifo_depth variable) must be a non-negative integer.",
+        backend,
+        profiling_fifo_depth,
+        io_type='io_stream',
+        run_fifo_depth_optimization=True,
+    )
+
+
+@pytest.mark.skip(reason='Skipping synthesis tests for now')
+@pytest.mark.parametrize('backend', backend_options)
+def test_runtime_error(backend):
+    """Expect a RuntimeError when the FIFO depth optimizer is used with io_type='io_parallel'."""
+    expect_exception(
+        RuntimeError,
+        "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config.",
+        backend,
+        profiling_fifo_depth=200_000,
+        io_type='io_parallel',
+        run_fifo_depth_optimization=True,
+    )
+
+
+@pytest.mark.skip(reason='Skipping synthesis tests for now')
+@pytest.mark.parametrize('backend', backend_options)
+def test_successful_execution_of_dummy_keras(backend):
+    """Run the Keras bitstream generation with the FIFO depth optimizer disabled."""
+    run_options = {'profiling_fifo_depth': 200_000, 'io_type': 'io_stream', 'run_fifo_depth_optimization': False}
+    run_bitstream_generation_keras(backend, **run_options)
+
+
+@pytest.mark.skip(reason='Skipping synthesis tests for now')
+@pytest.mark.parametrize('backend', backend_options)
+def test_successful_execution_of_dummy_keras_with_fifo_optimization(backend):
+    """Run the Keras bitstream generation with the FIFO depth optimizer enabled."""
+    run_options = {'profiling_fifo_depth': 200_000, 'io_type': 'io_stream', 'run_fifo_depth_optimization': True}
+    run_bitstream_generation_keras(backend, **run_options)
+
+
+@pytest.mark.skip(reason='Skipping synthesis tests for now')
+@pytest.mark.parametrize('backend', backend_options)
+def test_successful_execution_of_branched_model(backend):
+    """Run the bitstream generation on the branched ONNX model with the FIFO depth optimizer disabled."""
+    branched_model = get_branched_model()
+    run_bitstream_generation_onnx(
+        backend,
+        profiling_fifo_depth=200_000,
+        io_type='io_stream',
+        model=branched_model,
+        run_fifo_depth_optimization=False,
+    )
+
+
+@pytest.mark.skip(reason='Skipping synthesis tests for now')
+@pytest.mark.parametrize('backend', backend_options)
+def test_successful_execution_of_branched_model_with_fifo_optimization(backend):
+    """Run the bitstream generation on the branched ONNX model with the FIFO depth optimizer enabled."""
+    branched_model = get_branched_model()
+    run_bitstream_generation_onnx(
+        backend,
+        profiling_fifo_depth=200_000,
+        io_type='io_stream',
+        model=branched_model,
+        run_fifo_depth_optimization=True,
+    )