import json
import os
import zipfile


def _is_stream_variable(variable):
    """Return True if the type name of ``variable`` contains ``VivadoStreamVariable``."""
    return "VivadoStreamVariable" in str(type(variable))


def _autopilot_db_dir(model):
    """Return ``<output_dir>/<project_name>_prj/solution1/.autopilot/db`` for ``model``."""
    return os.path.join(
        model.config.get_output_dir(),
        model.config.get_project_name() + "_prj",
        "solution1",
        ".autopilot",
        "db",
    )


def initialize_large_fifos(model, profiling_fifo_depth):
    """Set all FIFO depths equal to a large value so that they can be profiled.

    Args:
        model (ModelGraph): The model to which FIFO depth optimization is applied.
        profiling_fifo_depth (int): A large positive integer, must be larger than the max expected depth of the FIFOs.

    Returns:
        Dict[str, int]: FIFO names mapped to the depths they had before this call, kept for comparison with the
        optimized depths. The model input and output FIFOs are stored under ``in_local`` and ``out_local``.
    """
    input_variable = model.get_input_variables()[0]
    output_variable = model.get_output_variables()[0]

    # Alternatively, "config_dataflow -override_user_fifo_depth profiling_fifo_depth" can be used inside
    # build_prj.tcl to override all FIFO depths with the specified value.
    initial_fifo_depths = {}
    for variable in model.output_vars.values():
        # Only internal stream variables are handled here; the model's own input/output are handled below
        # under fixed names.
        if not _is_stream_variable(variable):
            continue
        if not (variable != output_variable and variable != input_variable):
            continue
        if variable.pragma:
            initial_fifo_depths[variable.name] = int(variable.pragma[1])
            variable.pragma = (variable.pragma[0], profiling_fifo_depth)

    for fifo_name, variable in (('in_local', input_variable), ('out_local', output_variable)):
        initial_fifo_depths[fifo_name] = int(variable.pragma[1])
        variable.pragma = (variable.pragma[0], profiling_fifo_depth)

    return initial_fifo_depths


def execute_cosim_to_profile_fifos(model):
    """Write the project and run synthesis plus cosimulation with ``fifo_opt=True`` to profile the FIFOs.

    Args:
        model (ModelGraph): The model to which FIFO depth optimization is applied.
    """
    model.write()

    model.build(
        reset=False,
        csim=False,
        synth=True,
        cosim=True,
        validation=False,
        export=False,
        vsynth=False,
        fifo_opt=True,
    )


def get_vitis_optimized_fifo_depths(model):
    """Parse the files generated by the cosimulation to retrieve the optimized depths for the FIFOs.

    Args:
        model (ModelGraph): The model to which FIFO depth optimization is applied.

    Returns:
        Dict[str, int]: A dictionary that contains the FIFO names as keys and the optimized depths as values.

    Raises:
        FileNotFoundError: If ``channel.zip`` or one of the CSV files is missing.
    """
    db_dir = _autopilot_db_dir(model)
    channel_depth_dir = os.path.join(db_dir, "channel_depth_info")

    # channel.zip is generated after the cosimulation and contains the chan_status*.csv files. It is extracted
    # with the standard library instead of a shell `unzip` so paths with spaces work and failures raise.
    with zipfile.ZipFile(os.path.join(channel_depth_dir, "channel.zip")) as archive:
        archive.extractall(channel_depth_dir)

    # channel_info.csv maps each FIFO name (i.e. layer4_out_U, 2nd column) to its chan_status*.csv file
    # (4th column).
    csv_fifo_depth_files = {}
    with open(os.path.join(db_dir, "channel_info.csv")) as names_file:
        for line in names_file:
            if not line.strip():
                continue
            fields = line.split(",")
            # strip() removes the line terminator whether or not the last line has one
            csv_fifo_depth_files[fields[1]] = fields[3].strip()

    optimized_fifo_depths = {}
    for channel_name, file_name in csv_fifo_depth_files.items():
        with open(os.path.join(channel_depth_dir, file_name)) as chan_status_file:
            rows = [row.strip() for row in chan_status_file if row.strip()]
        # remove the "_U" suffix from the channel name
        fifo_name = channel_name[:-2] if channel_name.endswith("_U") else channel_name
        # the last line of the file holds the max depth reached during cosimulation
        optimized_fifo_depths[fifo_name] = int(rows[-1])

    return optimized_fifo_depths


def generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths):
    """Write ``fifo_depths.json`` with the initial and optimized depth of every FIFO, for post-processing.

    The json file is not used by the rest of the pipeline, it is only produced for the user.

    Args:
        model (ModelGraph): The model to which FIFO depth optimization is applied.
        initial_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the initial
            depths as values.
        optimized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized
            depths as values. A FIFO missing from it is reported with ``null`` as its optimized depth.
    """
    depths = {
        fifo_name: {'initial': initial_depth, 'optimized': optimized_fifo_depths.get(fifo_name)}
        for fifo_name, initial_depth in initial_fifo_depths.items()
    }

    with open(os.path.join(model.config.get_output_dir(), "fifo_depths.json"), "w") as f:
        json.dump(depths, f, indent=4)


def set_optimized_fifo_depths(model, optimized_fifo_depths):
    """Set the new optimized FIFO depths.

    Args:
        model (ModelGraph): The model to which FIFO depth optimization is applied.
        optimized_fifo_depths (Dict[str, int]): A dictionary that contains the FIFO names as keys and the optimized
            depths as values. Must contain the ``in_local`` and ``out_local`` keys.
    """
    # iterate through the layer output FIFOs
    for variable in model.output_vars.values():
        is_fifo = _is_stream_variable(variable) or variable.name in ('in_local', 'out_local')
        if not is_fifo or not variable.pragma:
            continue
        # FIFOs that were not profiled keep their current depth
        if variable.name in optimized_fifo_depths:
            variable.pragma = (variable.pragma[0], optimized_fifo_depths[variable.name])

    inp = model.get_input_variables()[0]
    inp.pragma = (inp.pragma[0], optimized_fifo_depths['in_local'])

    # keep the output variable's own pragma kind (it was previously taken from the input variable)
    outp = model.get_output_variables()[0]
    outp.pragma = (outp.pragma[0], optimized_fifo_depths['out_local'])
from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass


class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass):
    """Optimizer pass that profiles the FIFOs through cosimulation and resizes them to the measured depths."""

    def __init__(self):
        pass

    def transform(self, model):
        """Perform FIFO depth optimization between the FIFOs of all layers to reduce resource utilization as the
        initial FIFOs set by hls4ml might be larger than required. At the end of the optimization the FIFOs will
        have the largest depths achieved during cosimulation without causing any deadlocks between the layers
        (producer-consumer), thus no additional delays between the layers. In some cases, this optimization
        might lead to bigger FIFOs than initially set by the hls4ml tool in order to prevent deadlocks.

        Args:
            model (ModelGraph): The model to which FIFO depth optimization is applied.

        Raises:
            ValueError: If the FIFO depth for profiling provided by the user is not a positive integer.
            RuntimeError: If the IO type is not set to "io_stream".

        Returns:
            bool: Always ``False``.
        """
        # The profiling depth is read from the `profiling_fifo_depth` attribute of this pass and defaults to
        # 100_000 when the attribute is absent.
        # Consider replacing 100_000 with a value larger than any total BRAM storage space, or profiling via
        # Vitis 2023.2 C-simulation instead.
        profiling_fifo_depth = getattr(self, "profiling_fifo_depth", 100_000)

        # bool is a subclass of int, so it has to be rejected explicitly
        depth_is_valid = (
            isinstance(profiling_fifo_depth, int)
            and not isinstance(profiling_fifo_depth, bool)
            and profiling_fifo_depth > 0
        )
        if not depth_is_valid:
            raise ValueError("The FIFO depth for profiling (profiling_fifo_depth variable) must be a positive integer.")

        # the optimization only applies to streaming IO
        if model.config.get_config_value("IOType") != "io_stream":
            raise RuntimeError("To use this optimization you have to set `IOType` field to `io_stream` in the HLS config.")

        initial_fifo_depths = initialize_large_fifos(model, profiling_fifo_depth)

        execute_cosim_to_profile_fifos(model)

        optimized_fifo_depths = get_vitis_optimized_fifo_depths(model)

        generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths)

        set_optimized_fifo_depths(model, optimized_fifo_depths)

        print("[hls4ml] - FIFO optimization completed")
        return False
import os

from hls4ml.backends import VitisBackend, VivadoBackend
from hls4ml.model.flow import register_flow
from hls4ml.report import parse_vivado_report


class VitisAcceleratorIPFlowBackend(VitisBackend):
    """Vitis-based backend registered under the name ``VitisAcceleratorIPFlow``."""

    def __init__(self):
        # super(VivadoBackend, self) starts the method lookup after VivadoBackend in the MRO, so the
        # initializer that runs is the one of the class following VivadoBackend.
        super(VivadoBackend, self).__init__(name='VitisAcceleratorIPFlow')
        self._register_layer_attributes()
        self._register_flows()

    def build(
        self,
        model,
        reset=False,
        csim=True,
        synth=True,
        cosim=False,
        validation=False,
        export=False,
        vsynth=False,
        fifo_opt=False,
        bitfile=False,
    ):
        """Run the VitisBackend build and, if requested, generate a bitfile with Vivado.

        Args:
            model: The model to build; its ``config.get_output_dir()`` is where Vivado is launched.
            reset, csim, synth, cosim, validation, export, vsynth, fifo_opt: Forwarded unchanged to the
                parent ``build``.
            bitfile (bool): When True, run ``vivado -mode batch -source design.tcl`` in the output directory.

        Returns:
            The result of ``parse_vivado_report`` for the output directory.
        """
        # run the VitisBackend build
        super().build(
            model,
            reset=reset,
            csim=csim,
            synth=synth,
            cosim=cosim,
            validation=validation,
            export=export,
            vsynth=vsynth,
            fifo_opt=fifo_opt,
        )

        # now make a bitfile
        if bitfile:
            curr_dir = os.getcwd()
            os.chdir(model.config.get_output_dir())
            try:
                # os.system reports failure through its return value, it does not raise
                exit_status = os.system('vivado -mode batch -source design.tcl')
            finally:
                # always go back to the caller's working directory
                os.chdir(curr_dir)
            if exit_status != 0:
                print("Something went wrong, check the Vivado logs")

        return parse_vivado_report(model.config.get_output_dir())

    def create_initial_config(
        self,
        board='pynq-z2',
        part=None,
        clock_period=5,
        clock_uncertainty='12.5%',
        io_type='io_parallel',
        interface='axi_stream',
        driver='python',
        input_type='float',
        output_type='float',
    ):
        '''
        Create initial accelerator config with default parameters

        Args:
            board: one of the keys defined in supported_boards.json. ``None`` selects `pynq-z2`.
            part: forwarded to the parent ``create_initial_config``.
            clock_period: clock period passed to hls project
            clock_uncertainty: forwarded to the parent ``create_initial_config``.
            io_type: io_parallel or io_stream
            interface: `axi_stream`: generate hardware designs and drivers which exploit axi stream channels.
                       `axi_master`: generate hardware designs and drivers which exploit axi master channels.
                       `axi_lite` : generate hardware designs and drivers which exploit axi lite channels. (Don't use it
                       to exchange large amount of data)
            driver: `python`: generates the python driver to use the accelerator in the PYNQ stack.
                    `c`: generates the c driver to use the accelerator bare-metal.
            input_type: the wrapper input precision. Can be `float` or an `ap_type`. Note:
                        VitisAcceleratorIPFlowConfig rounds the number of bits up to the next multiple of 8.
            output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note:
                         VitisAcceleratorIPFlowConfig rounds the number of bits up to the next multiple of 8.

        Returns:
            populated config
        '''
        board = board if board is not None else 'pynq-z2'
        config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type)
        config['AcceleratorConfig'] = {
            'Board': board,
            'Interface': interface,  # axi_stream, axi_master, axi_lite
            'Driver': driver,
            'Precision': {
                'Input': input_type,  # float, double or ap_fixed
                'Output': output_type,  # float, double or ap_fixed
            },
        }

        return config

    def get_default_flow(self):
        """Return the default flow set in ``_register_flows``."""
        return self._default_flow

    def get_writer_flow(self):
        """Return the writer flow registered in ``_register_flows``."""
        return self._writer_flow

    def _register_flows(self):
        """Register the ``write`` and ``fifo_depth_optimization`` flows and set the default flow."""
        vitis_ip = 'vitis:ip'
        writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls']
        self._writer_flow = register_flow('write', writer_passes, requires=[vitis_ip], backend=self.name)
        self._default_flow = vitis_ip

        # Register the fifo depth optimization flow which is different from the one for vivado
        fifo_depth_opt_passes = [
            'vitisacceleratoripflow:fifo_depth_optimization'
        ] + writer_passes  # After optimization, a new project will be written

        register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=[vitis_ip], backend=self.name)
class VitisAcceleratorIPFlowConfig:
    """Accelerator settings (board, interface, driver, I/O precision) read from the hls4ml configuration."""

    def __init__(self, config, model_inputs, model_outputs):
        """Validate the ``AcceleratorConfig`` section of ``config`` and derive the wrapper I/O types.

        Args:
            config: object exposing the configuration dict as ``config.config`` and, for non-float
                precisions, ``config.backend.convert_precision_string``.
            model_inputs: sequence holding exactly one model input.
            model_outputs: sequence holding exactly one model output.

        Raises:
            Exception: if the board is not in supported_boards.json or the precision section is incomplete.
            AssertionError: if the model has more than one input or output.
        """
        self.config = config.config
        self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2')
        boards_file = os.path.join(os.path.dirname(__file__), 'supported_boards.json')
        with open(boards_file) as f:  # close the file instead of leaking the handle
            self.supported_boards = json.load(f)
        if self.board in self.supported_boards:
            board_info = self.supported_boards[self.board]
            self.part = board_info['part']
        else:
            raise Exception('The board does not appear in supported_boards.json file')

        # the board dictates the part: a conflicting user-provided Part is overridden
        if self.config.get('Part') is not None:
            if self.config.get('Part') != self.part:
                print(
                    'WARNING: You set a Part that does not correspond to the Board you specified. The correct '
                    'Part is now set.'
                )
                self.config['Part'] = self.part
        accel_config = self.config.get('AcceleratorConfig', None)
        if accel_config is not None:
            prec = accel_config.get('Precision')
            if prec is None:
                raise Exception('Precision must be provided in the AcceleratorConfig')
            if prec.get('Input') is None or prec.get('Output') is None:
                raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision')
        else:
            accel_config = {
                'Precision': {'Input': 'float', 'Output': 'float'},
                'Driver': 'python',
                'Interface': 'axi_stream',
            }
            config.config['AcceleratorConfig'] = accel_config

        accel_config = self.config['AcceleratorConfig']
        self.interface = accel_config.get('Interface', 'axi_stream')  # axi_stream, axi_master, axi_lite
        self.driver = accel_config.get('Driver', 'python')  # python or c
        self.input_type = accel_config['Precision'].get('Input', 'float')  # float, double or ap_fixed
        self.output_type = accel_config['Precision'].get('Output', 'float')  # float, double or ap_fixed
        self.platform = accel_config.get('Platform', 'xilinx_u250_xdma_201830_2')  # Get platform folder name

        assert (
            len(model_inputs) == 1
        ), "Only models with one input tensor are currently supported by VitisAcceleratorIPFlowBackend"
        assert (
            len(model_outputs) == 1
        ), "Only models with one output tensor are currently supported by VitisAcceleratorIPFlowBackend"
        self.inp = model_inputs[0]
        self.out = model_outputs[0]
        inp_axi_t = self.input_type
        out_axi_t = self.output_type

        if inp_axi_t not in ['float', 'double']:
            self.input_type = self._next_factor8_type(config.backend.convert_precision_string(inp_axi_t))
        if out_axi_t not in ['float', 'double']:
            self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t))

        self.input_bitwidth = self._axi_bitwidth(config, inp_axi_t)
        self.output_bitwidth = self._axi_bitwidth(config, out_axi_t)

    @staticmethod
    def _axi_bitwidth(config, axi_type):
        """Return 32 for 'float', 64 for 'double', otherwise the width of the parsed precision string."""
        if axi_type == 'float':
            return 32
        if axi_type == 'double':
            return 64
        return config.backend.convert_precision_string(axi_type).width

    def _next_factor8_type(self, p):
        '''Return a new type with the width rounded up to the next multiple of 8
        Args:
            p : IntegerPrecisionType or FixedPrecisionType
        Returns:
            An IntegerPrecisionType or FixedPrecisionType with the width rounded up to the next multiple of 8
            of p's width. Other parameters (fractional bits, extra modes) stay the same. ``None`` for any
            other type.
        '''
        W = p.width
        newW = int(np.ceil(W / 8) * 8)
        if isinstance(p, FixedPrecisionType):
            return FixedPrecisionType(newW, p.integer, p.signed, p.rounding_mode, p.saturation_mode, p.saturation_bits)
        elif isinstance(p, IntegerPrecisionType):
            return IntegerPrecisionType(newW, p.signed)
        return None

    def get_io_bitwidth(self):
        """Return the ``(input, output)`` bit widths."""
        return self.input_bitwidth, self.output_bitwidth

    def get_corrected_types(self):
        """Return ``(input_type, output_type, model input, model output)``."""
        return self.input_type, self.output_type, self.inp, self.out

    def get_interface(self):
        return self.interface

    def get_board_info(self, board=None):
        """Return the supported_boards.json entry for ``board`` (defaults to the configured board)."""
        if board is None:
            board = self.board
        if board in self.supported_boards:
            return self.supported_boards[board]
        else:
            raise Exception('The board is still not supported')

    def get_part(self):
        return self.part

    def get_driver(self):
        return self.driver

    def get_board(self):
        return self.board

    def get_platform(self):
        return self.platform

    def get_clock_period(self):
        """Return the ``ClockPeriod`` entry of the configuration, or ``None`` if it is absent."""
        # No `clock_period` attribute is ever assigned on this class, so the value is read from the
        # configuration dict. NOTE(review): assumes the top-level key is `ClockPeriod` - confirm.
        return self.config.get('ClockPeriod')

    def get_driver_path(self):
        """Return the template path of the driver file for the configured board, driver and interface."""
        # all alveo boards share one template directory
        board_dir = 'alveo' if self.board.startswith('alveo') else self.board
        return (
            '../templates/vitis_accelerator_ip_flow/'
            + board_dir
            + '/'
            + self.driver
            + '_drivers/'
            + self.get_driver_file()
        )

    def get_driver_file(self):
        """Return ``<interface>_driver.py`` for the python driver, ``<interface>_driver.h`` otherwise."""
        driver_ext = '.py' if self.driver == 'python' else '.h'
        return self.interface + '_driver' + driver_ext

    def get_krnl_rtl_src_dir(self):
        return '../templates/vitis_accelerator_ip_flow/' + 'alveo/' + 'krnl_rtl_src'

    def get_input_type(self):
        return self.input_type

    def get_output_type(self):
        return self.output_type

    def get_tcl_file_path(self):
        """Return the template path of the tcl script for the configured board and interface.

        Raises:
            Exception: if the board entry has no ``tcl_scripts`` or none for the configured interface.
        """
        board_info = self.get_board_info(self.board)
        tcl_scripts = board_info.get('tcl_scripts', None)
        if tcl_scripts is None:
            raise Exception('No tcl scripts definition available for the board in supported_board.json')
        tcl_script = tcl_scripts.get(self.interface, None)
        if tcl_script is None:
            raise Exception('No tcl script definition available for the desired interface in supported_board.json')
        board_dir = 'alveo' if self.board.startswith('alveo') else self.board
        return '../templates/vitis_accelerator_ip_flow/' + board_dir + '/tcl_scripts/' + tcl_script
from datetime import datetime

import numpy as np
from pynq import Overlay, allocate


class NeuralNetworkOverlay(Overlay):
    """Overlay that streams inputs to the accelerator through ``hier_0.axi_dma_0`` and reads back the outputs."""

    def __init__(
        self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None
    ):
        """Load the overlay and allocate the input and output buffers.

        Parameters:
        - bitfile_name, dtbo, download, ignore_version, device : forwarded to ``Overlay.__init__``.
        - x_shape / y_shape : shapes of the input and output buffers.
        - dtype : element type of both buffers.
        """
        # forward the caller's options instead of always passing the defaults
        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
        self.output_buffer = allocate(shape=y_shape, dtype=dtype)

    def _print_dt(self, timea, timeb, N):
        """Print and return the elapsed seconds between ``timea`` and ``timeb`` and the rate for ``N`` samples."""
        dt = timeb - timea
        dts = dt.seconds + dt.microseconds * 10**-6
        rate = N / dts
        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
        return dts, rate

    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
        """
        Obtain the predictions of the NN implemented in the FPGA.
        Parameters:
        - X : the input vector. Should be numpy ndarray.
        - debug : boolean. Set it to `True` to print a message after each transfer step.
        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
        - encode/decode: function pointers applied to the input before the transfer and to the output after it.
            The buffers use the `dtype` given to the constructor. It should be set depending on the interface
            of the accelerator; if it uses 'float' types for the 'data' AXI-Stream field, 'np.float32' dtype
            is the correct one to use. Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use
            (note that A cannot be any integer value, but it can assume {..., 8, 16, 32, ...} values. Check
            `numpy` doc for more info).
            In this case the encoding/decoding has to be computed by the PS. For example for
            'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
            'float' -> 'ap_fixed<16,6>':
            ```
            def encode(xi):
                return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
            def decode(yi):
                return yi * 2**-10
            encode_v = np.vectorize(encode) # to apply them element-wise
            decode_v = np.vectorize(decode)
            ```
        - return: the output buffer (the decoded output when `decode` is given). When `profile` is `True`, a
            tuple `(output, seconds, rate)`.
        """
        if profile:
            timea = datetime.now()
        if encode is not None:
            X = encode(X)
        self.input_buffer[:] = X
        self.sendchannel.transfer(self.input_buffer)
        self.recvchannel.transfer(self.output_buffer)
        if debug:
            print("Transfer OK")
        self.sendchannel.wait()
        if debug:
            print("Send OK")
        self.recvchannel.wait()
        if debug:
            print("Receive OK")

        # Keep `self.output_buffer` bound to the allocated buffer: the decoded array goes into a local so
        # that the next call still transfers into the buffer created by `allocate`.
        # Without `decode` the returned array is the buffer itself and is overwritten by the next call.
        result = self.output_buffer
        if decode is not None:
            result = decode(result)

        if profile:
            timeb = datetime.now()
            dts, rate = self._print_dt(timea, timeb, len(X))
            return result, dts, rate
        else:
            return result
# Vivado block design for the pynq-z2 AXI-Stream flow: Zynq PS + AXI DMA + the HLS-generated
# ${project_name}_axi IP, followed by implementation up to the bitstream.
#@todo: try to remove startgroup and endgroup and see if it work
set tcldir [file dirname [info script]]
# project.tcl is expected to define project_name
source [file join $tcldir project.tcl]

create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xc7z020clg400-1 -force

# set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
set_property ip_repo_paths ${project_name}_prj [current_project]
update_ip_catalog

create_bd_design "design_1"

startgroup
create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
endgroup

apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]

# enable the two high-performance slave ports used by the DMA
startgroup
set_property -dict [list \
  CONFIG.PCW_USE_S_AXI_HP0 {1} \
  CONFIG.PCW_USE_S_AXI_HP2 {1} \
] [get_bd_cells processing_system7_0]
endgroup

startgroup
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
endgroup

set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
set_property -dict [list \
  CONFIG.c_include_sg {0} \
  CONFIG.c_m_axi_mm2s_data_width {64} \
  CONFIG.c_m_axi_s2mm_data_width {64} \
  CONFIG.c_mm2s_burst_size {32} \
  CONFIG.c_sg_length_width {26} \
] [get_bd_cells axi_dma_0]

startgroup
create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
endgroup

# DMA -> accelerator input, accelerator output -> DMA
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]

#todo: make clock a variable
startgroup
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP2} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP2]
# the accelerator cell is named ${project_name}_axi_0 (created above), not a fixed myproject_axi_0
apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk]
endgroup

validate_bd_design

open_bd_design {./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd}

make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top

add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v

reset_run impl_1
reset_run synth_1
#todo: make number of jobs a variable
launch_runs impl_1 -to_step write_bitstream -jobs 10
wait_on_run -timeout 480 impl_1

open_run impl_1
report_utilization -file util.rpt -hierarchical -hierarchical_percentages
./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+#todo: make number of jobs a variable
+launch_runs impl_1 -to_step write_bitstream -jobs 10
+wait_on_run -timeout 480 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1d70e55406
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
@@ -0,0 +1,91 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import PL, Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    """PYNQ overlay that drives the accelerator through the `axi_dma_0` DMA engine."""
+
+    def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None):
+        # Forward the caller's options instead of re-hardcoding the defaults.
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+
+    def _print_dt(self, timea, timeb, N):
+        """Print and return the elapsed seconds and the inference rate for N samples."""
+        dt = timeb - timea
+        dts = dt.seconds + dt.microseconds * 10**-6
+        rate = N / dts
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        return dts, rate
+
+    @staticmethod
+    def reset_PL():
+        """Reset the programmable logic (calls `PL.reset()`)."""
+        PL.reset()
+
+    def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - y_shape : the shape of the output vector. Needed by the accelerator to set the TLAST bit properly and
+                    for sizing the output vector shape.
+        - dtype : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot be
+                  any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example for
+                  'ap_fixed<16,6>' type the following 2 functions are the correct ones to use for encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                  def encode(xi):
+                      return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                  def decode(yi):
+                      return yi * 2**-10
+                  encode_v = np.vectorize(encode) # to apply them element-wise
+                  decode_v = np.vectorize(decode)
+                  ```
+        - debug : if truthy, print a progress message after each DMA step.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inference/s`.
+        - encode/decode: function pointers. See `dtype` section for more information.
+        - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+                  When `profile` is `True` a tuple `(result, dts, rate)` is returned instead.
+        """
+
+        if encode is not None:
+            X = encode(X)
+        with allocate(shape=X.shape, dtype=dtype) as input_buffer, allocate(shape=y_shape, dtype=dtype) as output_buffer:
+            input_buffer[:] = X
+
+            if profile:
+                timea = datetime.now()
+
+            self.axi_dma_0.sendchannel.transfer(input_buffer)
+            self.axi_dma_0.recvchannel.transfer(output_buffer)
+            if debug:
+                print("Transfer OK")
+            self.axi_dma_0.sendchannel.wait()
+            if debug:
+                print("Send OK")
+            self.axi_dma_0.recvchannel.wait()
+
+            if profile:
+                timeb = datetime.now()
+
+            if debug:
+                print("Receive OK")
+
+            result = output_buffer.copy()
+
+        if decode is not None:
+            result = decode(result)
+
+        if profile:
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return result, dts, rate
+
+        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..689186eb5f
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,65 @@
+#@todo: try to remove startgroup and endgroup and see if it work
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xczu9eg-ffvb1156-2-e -force
+
+set_property board_part xilinx.com:zcu102:part0:3.3 [current_project]
+set_property ip_repo_paths ${project_name}_prj [current_project]
+update_ip_catalog
+
+create_bd_design "design_1"
+set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project]
+update_ip_catalog
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_1
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_1]
+
+set_property -dict [list \
+    CONFIG.PSU__SAXIGP2__DATA_WIDTH {64} \
+    CONFIG.PSU__SAXIGP4__DATA_WIDTH {64} \
+
CONFIG.PSU__USE__S_AXI_GP2 {1} \ + CONFIG.PSU__USE__S_AXI_GP4 {1} \ +] [get_bd_cells zynq_ultra_ps_e_1] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup + +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list \ + CONFIG.c_include_sg {0} \ + CONFIG.c_m_axi_mm2s_data_width {64} \ + CONFIG.c_m_axi_s2mm_data_width {64} \ + CONFIG.c_mm2s_burst_size {32} \ + CONFIG.c_sg_length_width {26} \ +] [get_bd_cells axi_dma_0] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_1/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_1/S_AXI_HP0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP0_FPD] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_1/S_AXI_HP2_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP2_FPD] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_1/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_1/M_AXI_HPM1_FPD] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup + +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] 
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] + +make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 480 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h new file mode 100755 index 0000000000..e01c8a8cd1 --- /dev/null +++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h @@ -0,0 +1,441 @@ +// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689 +/***************************************************************************** + * + * Author: Xilinx, Inc. + * + * This text contains proprietary, confidential information of + * Xilinx, Inc. , is distributed by under license from Xilinx, + * Inc., and may be used, copied and/or disclosed only pursuant to + * the terms of a valid license agreement with Xilinx, Inc. + * + * XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS" + * AS A COURTESY TO YOU, SOLELY FOR USE IN DEVELOPING PROGRAMS AND + * SOLUTIONS FOR XILINX DEVICES. BY PROVIDING THIS DESIGN, CODE, + * OR INFORMATION AS ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE, + * APPLICATION OR STANDARD, XILINX IS MAKING NO REPRESENTATION + * THAT THIS IMPLEMENTATION IS FREE FROM ANY CLAIMS OF INFRINGEMENT, + * AND YOU ARE RESPONSIBLE FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE + * FOR YOUR IMPLEMENTATION. 
XILINX EXPRESSLY DISCLAIMS ANY + * WARRANTY WHATSOEVER WITH RESPECT TO THE ADEQUACY OF THE + * IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OR + * REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE FROM CLAIMS OF + * INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. + * + * Xilinx products are not intended for use in life support appliances, + * devices, or systems. Use in such applications is expressly prohibited. + * +#- (c) Copyright 2011-2022 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. 
+#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + * + *****************************************************************************/ + +/* + * This file contains the definition of the data types for AXI streaming. + * ap_axi_s is a signed interpretation of the AXI stream + * ap_axi_u is an unsigned interpretation of the AXI stream + */ + +#ifndef __AP__AXI_SDATA__ +#define __AP__AXI_SDATA__ + +#include +#include "ap_int.h" +//#include "ap_fixed.h" +template +struct ap_fixed; +template +struct ap_ufixed; + +namespace hls { + +template constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT; + +template constexpr std::size_t bitwidth> = W; +template constexpr std::size_t bitwidth> = W; +template +constexpr std::size_t bitwidth> = _AP_W; +template +constexpr std::size_t bitwidth> = _AP_W; + +template +constexpr std::size_t bytewidth = (bitwidth + CHAR_BIT - 1) / CHAR_BIT; + +template struct axis { + static constexpr std::size_t NewWUser = (WUser == 0) ? 1 : WUser; + static constexpr std::size_t NewWId = (WId == 0) ? 1 : WId; + static constexpr std::size_t NewWDest = (WDest == 0) ? 
1 : WDest; + T data; + ap_uint> keep; + ap_uint> strb; + ap_uint user; + ap_uint<1> last; + ap_uint id; + ap_uint dest; + + ap_uint *get_user_ptr() { +#pragma HLS inline + return (WUser == 0) ? nullptr : &user; + } + ap_uint *get_id_ptr() { +#pragma HLS inline + return (WId == 0) ? nullptr : &id; + } + ap_uint *get_dest_ptr() { +#pragma HLS inline + return (WDest == 0) ? nullptr : &dest; + } +}; + +} // namespace hls + +template +using ap_axis = hls::axis, WUser, WId, WDest>; + +template +using ap_axiu = hls::axis, WUser, WId, WDest>; + +// Isolate out qdma_axis from hls::axis for special APIs. +template +struct qdma_axis; + +template struct qdma_axis { + // private: + static constexpr std::size_t kBytes = (WData + 7) / 8; + + ap_uint data; + ap_uint keep; + ap_uint<1> strb; + ap_uint<1> user; + ap_uint<1> last; + ap_uint<1> id; + ap_uint<1> dest; + + ap_uint<1> *get_strb_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_user_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_id_ptr() { +#pragma HLS inline + return nullptr; + } + ap_uint<1> *get_dest_ptr() { +#pragma HLS inline + return nullptr; + } + + // public: + ap_uint get_data() const { +#pragma HLS inline + return data; + } + ap_uint get_keep() const { +#pragma HLS inline + return keep; + } + ap_uint<1> get_last() const { +#pragma HLS inline + return last; + } + + void set_data(const ap_uint &d) { +#pragma HLS inline + data = d; + } + void set_keep(const ap_uint &k) { +#pragma HLS inline + keep = k; + } + void set_last(const ap_uint<1> &l) { +#pragma HLS inline + last = l; + } + void keep_all() { +#pragma HLS inline + ap_uint k = 0; + keep = ~k; + } + + qdma_axis() { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d) : data(d) { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d, ap_uint k) : data(d), keep(k) { +#pragma HLS inline + ; + } + qdma_axis(ap_uint d, ap_uint k, ap_uint<1> l) + : data(d), keep(k), last(l) { +#pragma HLS inline + ; + } + qdma_axis(const qdma_axis 
&d) + : data(d.data), keep(d.keep), last(d.last) { +#pragma HLS inline + ; + } + qdma_axis &operator=(const qdma_axis &d) { +#pragma HLS inline + data = d.data; + keep = d.keep; + last = d.last; + return *this; + } +}; + +#ifdef AESL_SYN +#if ((__clang_major__ != 3) || (__clang_minor__ != 1)) +#include "hls_stream.h" +namespace hls { + +template +class stream> final { + typedef axis __STREAM_T__; + +public: + /// Constructors + INLINE stream() {} + + INLINE stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid(&V.data, &V.keep, &V.strb, V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready(&V.data, &V.keep, &V.strb, V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + void read(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + dout = tmp; + } + + __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + return tmp; + } + + /// Blocking write + void write(const __STREAM_T__ &din) { +#pragma HLS inline + __STREAM_T__ tmp = din; + __fpga_axis_push(&V.data, 
&V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + } + + /// Non-Blocking read + bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// Non-Blocking write + bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + &V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, V.get_id_ptr(), + V.get_dest_ptr(), &tmp.data, &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +// specialization for qdma +template +class stream> { + typedef qdma_axis __STREAM_T__; + +public: + /// Constructors + INLINE stream() {} + + INLINE stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + void read(__STREAM_T__ &dout) { +#pragma 
HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), + &V.last, V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + dout = tmp; + } + + __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + return tmp; + } + + /// Blocking write + void write(const __STREAM_T__ &din) { +#pragma HLS inline + __STREAM_T__ tmp = din; + __fpga_axis_push(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep, + tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(), + tmp.get_dest_ptr()); + } + + /// Non-Blocking read + bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + + if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, + V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, + &tmp.keep, &tmp.strb, tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// Non-Blocking write + bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + &V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, V.get_id_ptr(), + V.get_dest_ptr(), &tmp.data, &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(), + &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +} // namespace hls +#endif +#endif +#endif diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 888c5f4c95..a1d0c6b774 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ 
b/hls4ml/templates/vivado/build_prj.tcl @@ -210,7 +210,7 @@ if {$opt(cosim)} { set time_end [clock clicks -milliseconds] puts "INFO:" - if {[string equal "$backend" "vivadoaccelerator"]} { + if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisacceleratoripflow"]} { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]] } else { puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]] diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index b8c2a48d19..2a695d4e5a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -11,6 +11,11 @@ #include #include +// this header cannot be included by Vivado HLS +// "VITIS_ACCELERATOR_IP_FLOW" is defined on the build_lib.sh of the `Vitis Accelerator` template files +#ifdef VITIS_ACCELERATOR_IP_FLOW +#include "ap_axi_sdata.h" +#endif namespace nnet { #ifndef __SYNTHESIS__ @@ -161,6 +166,26 @@ template void convert_data(hls::stre } } +#ifdef VITIS_ACCELERATOR_IP_FLOW +// todo avoid hardcoding hls::axis and use template +template +void convert_data(srcType *src, hls::stream> &dst) { + for (size_t i = 0; i < SIZE; i++) { + hls::axis ctype; + ctype.data = dstType(src[i]); + dst.write(ctype); + } +} + +template +void convert_data(hls::stream> &src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + hls::axis ctype = src.read(); + dst[i] = dstType(ctype.data); + } +} +#endif + extern bool trace_enabled; extern std::map *trace_outputs; extern size_t trace_type_size; @@ -247,8 +272,6 @@ template void save_layer_output(hls::stream &data, const } } -#endif - template void copy_data(std::vector src, dst_T dst[SIZE]) { typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; typename std::vector::const_iterator in_end = in_begin + SIZE; @@ -272,14 +295,27 @@ void copy_data(std::vector src, 
hls::stream &dst) { } template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { - for (auto i = 0; i < SIZE; i++) + for (auto i = 0; i < SIZE; i++) { + dst[i].data = src[i]; if (i == SIZE - 1) { - dst[i].data = src[i]; dst[i].last = 1; } else { - dst[i].data = src[i]; dst[i].last = 0; } + } +} + +template void copy_data_axi(std::vector src, hls::stream &dst) { + for (auto i = 0; i < SIZE; i++) { + dst_T pack; + pack.data = src[i]; + if (i == SIZE - 1) { + pack.last = 1; + } else { + pack.last = 0; + } + dst.write(pack); + } } template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { @@ -289,21 +325,55 @@ template void print_result(res_T result[SIZE], std::o out << std::endl; } -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { +template ::value, int>::type = 0> +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { for (int i = 0; i < SIZE / res_T::size; i++) { res_T res_pack = result.read(); for (int j = 0; j < res_T::size; j++) { out << res_pack[j] << " "; } - if (keep) + if (keep) { + result.write(res_pack); + } + } + out << std::endl; +} + +// compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel +template ::value, int>::type = 0> +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + res_T res_pack = result.read(); + + out << res_pack.data << " "; + + if (keep) { result.write(res_pack); + } + } + out << std::endl; +} + +// compatible with Vitis Accelerator for res_T = hls::axis and io_stream +template +void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / underlying_res_T::size; i++) { + res_T res_pack; + for (int j = 0; j < underlying_res_T::size; j++) { + res_pack = result.read(); + out << res_pack.data << " "; + if (keep) { + result.write(res_pack); + } + } } out << std::endl; } template void fill_zero(data_T data[SIZE]) { 
std::fill_n(data, SIZE, 0.); } -template void fill_zero(hls::stream &data) { +template ::value, int>::type = 0> +void fill_zero(hls::stream &data) { for (int i = 0; i < SIZE / data_T::size; i++) { data_T data_pack; for (int j = 0; j < data_T::size; j++) { @@ -313,6 +383,36 @@ template void fill_zero(hls::stream &data) { } } +template ::value, int>::type = 0> +void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE; i++) { + data_T data_pack; + data_pack.data = 0.; + if (i == SIZE - 1) { + data_pack.last = 1; + } else { + data_pack.last = 0; + } + data.write(data_pack); + } +} + +// compatible with Vitis Accelerator for res_T = hls::axis +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / underlying_data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < underlying_data_T::size; j++) { + data_pack.data = 0.; + if ((i == (SIZE / underlying_data_T::size - 1)) && (j == (underlying_data_T::size - 1))) { + data_pack.last = 1; + } else { + data_pack.last = 0; + } + data.write(data_pack); + } + } +} + template int read_file_1D(const char *filename, dataType data[nrows]) { FILE *fp; fp = fopen(filename, "r"); @@ -370,6 +470,7 @@ template void hls_stream_debug(hls::stream &dat res << datareg; } } +#endif constexpr int ceillog2(int x) { return (x <= 2) ? 
1 : 1 + ceillog2((x + 1) / 2); } diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 8de19fe1d2..0cc7d2b4b0 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -2,6 +2,7 @@ from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter +from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter @@ -10,6 +11,7 @@ register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) +register_writer('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowWriter) register_writer('Quartus', QuartusWriter) register_writer('oneAPI', OneAPIWriter) register_writer('Catapult', CatapultWriter) diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py new file mode 100644 index 0000000000..9805c5b33f --- /dev/null +++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py @@ -0,0 +1,393 @@ +import os +from shutil import copyfile + +from hls4ml.writer.vitis_writer import VitisWriter + + +class VitisAcceleratorIPFlowWriter(VitisWriter): + def __init__(self): + super().__init__() + self.vitis_accelerator_ip_flow_config = None + + def write_axi_wrapper(self, model): + '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces + Args: + model : The ModelGraph to write the wrapper for + ''' + inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types() + indent = ' ' + + ####################### + # myproject_axi.h + ####################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, 
'../templates/vitis_accelerator_ip_flow/myproject_axi.h')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.h', 'w') + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}.h"\n' + newline += '#include "ap_axi_sdata.h"\n' + elif 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert definitions' in line: + newline = '' + newline += f'static const unsigned N_IN = {inp.size()};\n' + newline += f'static const unsigned N_OUT = {out.size()};\n' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += 'typedef hls::axis dma_data_packet;\n' + # might need to make "float" a variable according to the + # configuration set by the user and the DMA available data widths + else: # TODO: handle this case + newline += f'typedef {inp_axi_t} input_axi_t;\n' + newline += f'typedef {out_axi_t} output_axi_t;\n' + else: + newline = line + fout.write(newline) + f.close() + fout.close() + + ####################### + # myproject_axi.cpp + ####################### + + f = open(os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.cpp')) + fout = open(f'{model.config.get_output_dir()}/firmware/{model.config.get_project_name()}_axi.cpp', 'w') + + io_type = model.config.get_config_value("IOType") + + for line in f.readlines(): + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + elif '// hls-fpga-machine-learning insert include' in line: + newline = f'#include "{model.config.get_project_name()}_axi.h"\n' + elif '// hls-fpga-machine-learning insert local vars' in line: + newline = '' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline 
+= indent + 'bool is_last = false;\n' + if io_type == 'io_parallel': # TODO: handle io_parallel + newline += indent + inp.type.name + ' in_local[N_IN];\n' + newline += indent + out.type.name + ' out_local[N_OUT];\n' + newline += indent + 'dma_data_packet tmp;\n' + elif io_type == 'io_stream': + newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' + newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' + newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif '// hls-fpga-machine-learning insert call' in line: + newline = indent + f'{model.config.get_project_name()}(in_local, out_local);\n' + elif '// hls-fpga-machine-learning insert interface' in line: + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_lite': # TODO: handle axi_lite + newline = '' + newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' + newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n' + newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n' + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_master': # TODO: handle axi_master + newline = '' + newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n' + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format( + model.get_input_variables()[0].pragma[1] + ) + newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format( + model.get_output_variables()[0].pragma[1] + ) + elif self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline = '' + newline += indent + '#pragma HLS INTERFACE axis port=in\n' + newline += indent + '#pragma HLS INTERFACE axis port=out\n' + newline += indent + 
'#pragma HLS INTERFACE ap_ctrl_none port=return\n' + if model.config.get_config_value("IOType") == 'io_stream': + newline += indent + '#pragma HLS DATAFLOW\n' + elif '// hls-fpga-machine-learning insert enqueue' in line: + io_type = model.config.get_config_value("IOType") + if io_type == 'io_parallel': # TODO: handle io_parallel + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + 'tmp = in.read(); // Read input with cast\n' + newline += indent + indent + 'in_local[i] = tmp.data;\n' + newline += indent + indent + 'is_last = tmp.last;\n' + else: + newline += indent + indent + '#pragma HLS UNROLL\n' + newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n' + newline += indent + '}\n' + newline += indent + 'tmp.last = 0;\n' + elif io_type == 'io_stream': + newline = '' + newline += indent + 'dma_data_packet tmp;\n' + + newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' # TODO: check if needed + newline += indent + indent + '{input_t} ctype;\n' + # newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n' + # newline += indent + indent + 'pragma HLS aggregate variable=ctype compact=auto' # TODO: check if needed + newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' # TODO: check if needed + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += indent + indent + indent + 'in.read(tmp);\n' + newline += indent + indent + indent + 'ctype[j] = tmp.data;\n' + newline += indent + indent + indent + 'is_last = tmp.last;\n' + else: # TODO: handle this case + newline += ( + indent + + indent + + indent + + 'ctype[j] = typename 
{input_t}::value_type(in[i * {input_t}::size + j].data);\n' + ) + newline += indent + indent + '}}\n' + newline += indent + indent + 'in_local.write(ctype);\n' + newline += indent + '}}\n' + newline += indent + 'tmp.last = 0;\n' + newline = newline.format(input_t=inp.type.name) + elif '// hls-fpga-machine-learning insert dequeue' in line: + io_type = model.config.get_config_value("IOType") + if io_type == 'io_parallel': # TODO: handle this case + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + 'tmp.data = out_local[i];\n' + newline += indent + indent + 'tmp.last = (is_last && (i == N_OUT - 1))? true : false;\n' + newline += indent + indent + 'out.write(tmp);\n' + else: + newline += indent + indent + '#pragma HLS UNROLL\n' + newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' + newline += indent + '}\n' + elif io_type == 'io_stream': + newline = '' + newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n' + # newline += indent + indent + '#pragma HLS PIPELINE\n' + newline += indent + indent + '{result_t} ctype = out_local.read();\n' + newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n' + # newline += indent + indent + indent + '#pragma HLS UNROLL\n' + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + newline += indent + indent + indent + f'tmp.data = ({inp_axi_t}) (ctype[j]);\n' + + newline += indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n' + + newline += indent + indent + indent + 'out.write(tmp);\n' + else: + newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n' + newline += indent + indent + '}}\n' + newline += indent + '}}\n' + newline = newline.format(result_t=out.type.name) + else: + 
def modify_build_script(self, model):
    '''
    Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function.

    ``build_prj.tcl`` in the model's output directory is rewritten through the temporary file
    ``build_prj_axi.tcl`` (renamed over the original once both files are closed):

    * the ``set_top`` line gets an ``_axi`` suffix and is followed by an ``add_files`` line for
      ``firmware/<project>_axi.cpp``;
    * occurrences of ``<project>_cosim`` become ``<project>_axi_cosim``;
    * occurrences of ``${project_name}.tcl`` become ``${project_name}_axi.tcl``.

    ``build_lib.sh`` is generated from the backend template with the ``myproject`` and ``mystamp`` placeholders
    replaced by the project name and the ``Stamp`` config value.

    Args:
        model: The hls4ml model; its ``config`` supplies the output directory, project name and stamp.
    '''
    filedir = os.path.dirname(os.path.abspath(__file__))
    output_dir = model.config.get_output_dir()
    project_name = model.config.get_project_name()

    ###################
    # build_prj.tcl
    ###################
    oldfile = f'{output_dir}/build_prj.tcl'
    newfile = f'{output_dir}/build_prj_axi.tcl'

    # Context managers guarantee both handles are closed even if a read or write fails.
    with open(oldfile) as f, open(newfile, 'w') as fout:
        for line in f:
            if 'set_top' in line:
                # Strip only a trailing newline (not an arbitrary last character) before appending the suffix.
                newline = line.rstrip('\n') + '_axi\n'
                newline += f'add_files firmware/{project_name}_axi.cpp -cflags "-std=c++0x"\n'
            elif f'{project_name}_cosim' in line:
                newline = line.replace(f'{project_name}_cosim', f'{project_name}_axi_cosim')
            elif '${project_name}.tcl' in line:
                newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl')
            else:
                newline = line
            fout.write(newline)

    # Both files are closed at this point, so the rewritten script can safely replace the original.
    os.rename(newfile, oldfile)

    ###################
    # build_lib.sh
    ###################
    template_path = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/build_lib.sh')
    with open(template_path) as f, open(f'{output_dir}/build_lib.sh', 'w') as fout:
        for line in f:
            line = line.replace('myproject', project_name)
            line = line.replace('mystamp', model.config.get_config_value('Stamp'))
            fout.write(line)
open(newfile, 'w') + + inp = model.get_input_variables()[0] + out = model.get_output_variables()[0] + io_type = model.config.get_config_value("IOType") + + for line in f.readlines(): + if f'{model.config.get_project_name()}.h' in line: + newline = line.replace(f'{model.config.get_project_name()}.h', f'{model.config.get_project_name()}_axi.h') + elif inp.definition_cpp() in line: + newline = line.replace( + inp.definition_cpp(), 'hls::stream< dma_data_packet > inputs' + ) # TODO instead of replacing strings, how about we use proper variables and their definition? + elif out.definition_cpp() in line: + newline = line.replace(out.definition_cpp(), 'hls::stream< dma_data_packet > outputs') + elif 'unsigned short' in line: + newline = '' + elif f'{model.config.get_project_name()}(' in line: + indent_amount = line.split(model.config.get_project_name())[0] + newline = indent_amount + f'{model.config.get_project_name()}_axi(inputs,outputs);\n' + elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: + newline = ( + line.replace(inp.size_cpp(), 'N_IN') + .replace(inp.name, 'inputs') + .replace(inp.type.name, 'dma_data_packet') + ) + elif out.size_cpp() in line or out.name in line or out.type.name in line: + newline = ( + line.replace(out.size_cpp(), 'N_OUT') + .replace(out.name, 'outputs') + .replace(out.type.name, 'dma_data_packet') + ) + else: + newline = line + if self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream': + if 'copy_data' in line: + newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "") + + if io_type == 'io_stream': + if 'nnet::fill_zero' in line: + newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ") + # indent = line.split('n')[0] + # newline = indent + indent + 'inputs[N_IN-1].last = 1;\n' + if 'print_result' in line: + newline = newline.replace("print_result<", f"print_result<{out.type.name}, ") + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, 
def write_board_script(self, model):
    '''
    Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAcceleratorIPFlow.

    Copies the Tcl file reported by the accelerator config to ``design.tcl`` in the output directory, then
    writes ``project.tcl`` declaring the project name, backend, part, clock period, clock uncertainty and
    version. For the ``axi_stream`` interface the I/O bit widths are written as well.

    Args:
        model: The hls4ml model; its ``config`` supplies the output directory, project name and config values.
    '''
    filedir = os.path.dirname(os.path.abspath(__file__))
    output_dir = model.config.get_output_dir()
    accel_config = self.vitis_accelerator_ip_flow_config

    copyfile(
        os.path.join(filedir, accel_config.get_tcl_file_path()),
        f'{output_dir}/design.tcl',
    )

    ###################
    # project.tcl
    ###################
    # The context manager closes the file even if one of the config lookups below raises.
    with open(f'{output_dir}/project.tcl', 'w') as f:
        f.write('variable project_name\n')
        f.write(f'set project_name "{model.config.get_project_name()}"\n')
        f.write('variable backend\n')
        f.write('set backend "vitisacceleratoripflow"\n')
        f.write('variable part\n')
        f.write(f'set part "{accel_config.get_part()}"\n')
        f.write('variable clock_period\n')
        f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')))
        f.write('variable clock_uncertainty\n')
        f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%')))
        f.write('variable version\n')
        f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0')))
        if accel_config.get_interface() == 'axi_stream':
            in_bit, out_bit = accel_config.get_io_bitwidth()
            # NOTE(review): the first value is written as bit_width_hls_output and the second as
            # bit_width_hls_input. Presumably the names are from the point of view of the design that
            # consumes project.tcl -- confirm before "fixing" the apparent swap.
            f.write(f'set bit_width_hls_output {in_bit}\n')
            f.write(f'set bit_width_hls_input {out_bit}\n')
def parse_cosim_report_and_search_for_bitstream(project_path):
    """Locate the cosimulation report and the bitstream of a built project.

    The project name is read from the ``set project_name "<name>"`` line of ``project.tcl`` (the last such
    line wins) and used to derive the expected locations of the cosimulation report and of the bitstream.

    Args:
        project_path (str or os.PathLike): Output directory of the hls4ml project.

    Returns:
        Tuple[str, bool]: The path of the cosimulation report and ``True`` (the bitstream exists).

    Raises:
        ValueError: If ``project.tcl`` contains no quoted ``set project_name`` line.
        FileNotFoundError: If the cosimulation report and/or the bitstream is missing.
    """
    project_path = os.fspath(project_path)
    top_func_name = None

    with open(project_path + '/project.tcl') as f:
        for line in f:
            if 'set project_name' in line:
                pieces = line.split('"')
                # A well-formed line splits into at least: prefix, name, suffix.
                if len(pieces) >= 3:
                    top_func_name = pieces[-2]

    if top_func_name is None:
        # Fail with a clear message instead of a TypeError from concatenating None below.
        raise ValueError("Could not read the project name from project.tcl.")

    prj_dir = top_func_name + '_prj'
    cosim_file_path = project_path + '/' + prj_dir + f'/solution1/sim/report/{top_func_name}_axi_cosim.rpt'
    bitstream_path = (
        project_path + '/' + f"{top_func_name}_vitis_accelerator_ip_flow/project_1.runs/impl_1/design_1_wrapper.bit"
    )

    cosim_report_exists = os.path.isfile(cosim_file_path)
    bitstream_exists = os.path.isfile(bitstream_path)

    if cosim_report_exists and bitstream_exists:
        return cosim_file_path, bitstream_exists
    if not cosim_report_exists and not bitstream_exists:
        raise FileNotFoundError("Co-simulation report and Bitstream not found.")
    if not cosim_report_exists:
        raise FileNotFoundError("Co-simulation report not found.")
    raise FileNotFoundError("Bitstream not found.")
+ """ + + # create a keras model + input_shape = (16, 16, 3) + activation = 'relu' + kernel_size = (3, 3) + padding = 'same' + + model = Sequential() + model.add( + SeparableConv2D(filters=4, kernel_size=kernel_size, padding=padding, activation=activation, input_shape=input_shape) + ) + model.add(SeparableConv2D(filters=8, kernel_size=kernel_size, padding=padding, activation=activation)) + model.compile(optimizer='adam', loss='mse') + + X_input = np.random.rand(1, *input_shape) + keras_prediction = model.predict(X_input) + + config = hls4ml.utils.config_from_keras_model(model, default_precision='ap_fixed<32, 16>') + + # include the FIFO Depth optimizer do the flows if `run_fifo_depth_optimization` is set + if run_fifo_depth_optimization: + config['Flows'] = ['vitisacceleratoripflow:fifo_depth_optimization'] + hls4ml.model.optimizer.get_optimizer('vitisacceleratoripflow:fifo_depth_optimization').configure( + profiling_fifo_depth=profiling_fifo_depth + ) + + output_dir = str( + test_root_path / f'hls4mlprj_keras_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}' + ) + + hls_model = hls4ml.converters.convert_from_keras_model( + model, io_type=io_type, hls_config=config, output_dir=output_dir, backend=backend, clock_period=10 + ) + + hls_model.compile() + hls_prediction = hls_model.predict(X_input).reshape(keras_prediction.shape) + + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=0.01) + + # build the hls4ml model and check if the bitstream was generated and the FIFOs were optimized if + # `run_fifo_depth_optimization` is set + build_and_check(hls_model, run_fifo_depth_optimization) + + +def build_and_check(hls_model, run_fifo_depth_optimization): + """Execute the FIFO depth optimization sequence on an hls4ml model.""" + + # try to generate a bitstream. 
def expect_exception(error, message, backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization=True):
    """Assert that the Keras bitstream-generation flow raises ``error`` with exactly ``message``.

    Args:
        error (Type[Exception]): The exception type that must be raised.
        message (str): The expected error message; it is escaped, so it is matched literally.
        backend (str): Name of the hls4ml backend under test.
        profiling_fifo_depth: The FIFO depth handed to the FIFO depth optimizer.
        io_type (str): The ``io_type`` used for the conversion.
        run_fifo_depth_optimization (bool): Forwarded to ``run_bitstream_generation_keras``. Callers in this
            module pass it by keyword; it was previously missing from this signature and from the inner call,
            which made every use of this helper fail with a ``TypeError``.
    """
    with pytest.raises(error, match=re.escape(message)):
        run_bitstream_generation_keras(backend, profiling_fifo_depth, io_type, run_fifo_depth_optimization)
def run_bitstream_generation_onnx(backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization):
    """Generate a bitstream for a QONNX model, optionally running the FIFO depth optimizer first.

    A random input is executed with QONNX and with the compiled hls4ml model; the two outputs must be
    identical before the build is started via ``build_and_check``.
    """
    input_name = model.graph.input[0].name
    input_shape = tuple(model.get_tensor_shape(input_name))

    # random input in [0, 1), rounded to multiples of 2**-16
    X = np.random.uniform(low=0, high=1, size=np.prod(input_shape)).reshape(input_shape)
    X = (np.round(X * 2**16) * 2**-16).astype(np.float32)

    qonnx_outputs = oxe.execute_onnx(model, {input_name: X})
    y_qonnx = qonnx_outputs[model.graph.output[0].name]

    config = hls4ml.utils.config.config_from_onnx_model(
        model, granularity='name', backend=backend, default_precision='ap_fixed<15,2,AP_RND_CONV>'
    )

    # add this line to remove the linear layer that quantizes the input of the NN
    config['LayerName']['global_in']['Precision']['result'] = 'fixed<4,0,AP_RND_CONV,AP_SAT,0>'

    if run_fifo_depth_optimization:
        optimizer_name = 'vitisacceleratoripflow:fifo_depth_optimization'
        config['Flows'] = [optimizer_name]
        fifo_optimizer = hls4ml.model.optimizer.get_optimizer(optimizer_name)
        fifo_optimizer.configure(profiling_fifo_depth=profiling_fifo_depth)

    project_dir_name = f'hls4mlprj_branched_model_backend_{backend}_fifo_optimization_{run_fifo_depth_optimization}'
    output_dir = str(test_root_path / project_dir_name)

    hls_model = hls4ml.converters.convert_from_onnx_model(
        model,
        output_dir=output_dir,
        io_type=io_type,
        backend=backend,
        hls_config=config,
        part="xczu9eg-ffvb1156-2-e",
        board='zcu102',
        clock_period=10,
    )
    hls_model.compile()

    y_hls4ml = hls_model.predict(np.ascontiguousarray(X))
    np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel())

    build_and_check(hls_model, run_fifo_depth_optimization)
@pytest.mark.skip(reason='Skipping synthesis tests for now')
@pytest.mark.parametrize('backend', backend_options)
def test_runtime_error(backend):
    """Check that the FIFO depth optimizer raises a RuntimeError when io_type is 'io_parallel'."""
    expected_message = "To use this optimization you have to set `IOType` field to `io_stream` in the HLS config."
    expect_exception(
        RuntimeError,
        expected_message,
        backend,
        io_type='io_parallel',
        profiling_fifo_depth=200_000,
        run_fifo_depth_optimization=True,
    )
@pytest.mark.skip(reason='Skipping synthesis tests for now')
@pytest.mark.parametrize('backend', backend_options)
def test_successful_execution_of_branched_model_with_fifo_optimization(backend):
    """Run bitstream generation on the branched QONNX model with the FIFO depth optimizer enabled."""
    branched_model = get_branched_model()
    # positional order: backend, profiling_fifo_depth, io_type, model, run_fifo_depth_optimization
    run_bitstream_generation_onnx(backend, 200_000, 'io_stream', branched_model, True)